import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import re
warnings.filterwarnings('ignore')
%matplotlib inline
pd.set_option("display.max_columns",250)
pd.set_option("display.max_rows",250)
pd.set_option("display.width",2000)
previous_application=pd.read_csv('E:/VCET/Trainity Intern/Project 6 Bank Loan Case Study/previous_application.csv')
application_data=pd.read_csv('E:/VCET/Trainity Intern/Project 6 Bank Loan Case Study/application_data.csv')
previous_application.shape
(1670214, 37)
application_data.shape
(307511, 122)
previous_application.info(verbose=True)
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1670214 entries, 0 to 1670213 Data columns (total 37 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 SK_ID_PREV 1670214 non-null int64 1 SK_ID_CURR 1670214 non-null int64 2 NAME_CONTRACT_TYPE 1670214 non-null object 3 AMT_ANNUITY 1297979 non-null float64 4 AMT_APPLICATION 1670214 non-null float64 5 AMT_CREDIT 1670213 non-null float64 6 AMT_DOWN_PAYMENT 774370 non-null float64 7 AMT_GOODS_PRICE 1284699 non-null float64 8 WEEKDAY_APPR_PROCESS_START 1670214 non-null object 9 HOUR_APPR_PROCESS_START 1670214 non-null int64 10 FLAG_LAST_APPL_PER_CONTRACT 1670214 non-null object 11 NFLAG_LAST_APPL_IN_DAY 1670214 non-null int64 12 RATE_DOWN_PAYMENT 774370 non-null float64 13 RATE_INTEREST_PRIMARY 5951 non-null float64 14 RATE_INTEREST_PRIVILEGED 5951 non-null float64 15 NAME_CASH_LOAN_PURPOSE 1670214 non-null object 16 NAME_CONTRACT_STATUS 1670214 non-null object 17 DAYS_DECISION 1670214 non-null int64 18 NAME_PAYMENT_TYPE 1670214 non-null object 19 CODE_REJECT_REASON 1670214 non-null object 20 NAME_TYPE_SUITE 849809 non-null object 21 NAME_CLIENT_TYPE 1670214 non-null object 22 NAME_GOODS_CATEGORY 1670214 non-null object 23 NAME_PORTFOLIO 1670214 non-null object 24 NAME_PRODUCT_TYPE 1670214 non-null object 25 CHANNEL_TYPE 1670214 non-null object 26 SELLERPLACE_AREA 1670214 non-null int64 27 NAME_SELLER_INDUSTRY 1670214 non-null object 28 CNT_PAYMENT 1297984 non-null float64 29 NAME_YIELD_GROUP 1670214 non-null object 30 PRODUCT_COMBINATION 1669868 non-null object 31 DAYS_FIRST_DRAWING 997149 non-null float64 32 DAYS_FIRST_DUE 997149 non-null float64 33 DAYS_LAST_DUE_1ST_VERSION 997149 non-null float64 34 DAYS_LAST_DUE 997149 non-null float64 35 DAYS_TERMINATION 997149 non-null float64 36 NFLAG_INSURED_ON_APPROVAL 997149 non-null float64 dtypes: float64(15), int64(6), object(16) memory usage: 471.5+ MB
application_data.info(verbose=True)
<class 'pandas.core.frame.DataFrame'> RangeIndex: 307511 entries, 0 to 307510 Data columns (total 122 columns): # Column Dtype --- ------ ----- 0 SK_ID_CURR int64 1 TARGET int64 2 NAME_CONTRACT_TYPE object 3 CODE_GENDER object 4 FLAG_OWN_CAR object 5 FLAG_OWN_REALTY object 6 CNT_CHILDREN int64 7 AMT_INCOME_TOTAL float64 8 AMT_CREDIT float64 9 AMT_ANNUITY float64 10 AMT_GOODS_PRICE float64 11 NAME_TYPE_SUITE object 12 NAME_INCOME_TYPE object 13 NAME_EDUCATION_TYPE object 14 NAME_FAMILY_STATUS object 15 NAME_HOUSING_TYPE object 16 REGION_POPULATION_RELATIVE float64 17 DAYS_BIRTH int64 18 DAYS_EMPLOYED int64 19 DAYS_REGISTRATION float64 20 DAYS_ID_PUBLISH int64 21 OWN_CAR_AGE float64 22 FLAG_MOBIL int64 23 FLAG_EMP_PHONE int64 24 FLAG_WORK_PHONE int64 25 FLAG_CONT_MOBILE int64 26 FLAG_PHONE int64 27 FLAG_EMAIL int64 28 OCCUPATION_TYPE object 29 CNT_FAM_MEMBERS float64 30 REGION_RATING_CLIENT int64 31 REGION_RATING_CLIENT_W_CITY int64 32 WEEKDAY_APPR_PROCESS_START object 33 HOUR_APPR_PROCESS_START int64 34 REG_REGION_NOT_LIVE_REGION int64 35 REG_REGION_NOT_WORK_REGION int64 36 LIVE_REGION_NOT_WORK_REGION int64 37 REG_CITY_NOT_LIVE_CITY int64 38 REG_CITY_NOT_WORK_CITY int64 39 LIVE_CITY_NOT_WORK_CITY int64 40 ORGANIZATION_TYPE object 41 EXT_SOURCE_1 float64 42 EXT_SOURCE_2 float64 43 EXT_SOURCE_3 float64 44 APARTMENTS_AVG float64 45 BASEMENTAREA_AVG float64 46 YEARS_BEGINEXPLUATATION_AVG float64 47 YEARS_BUILD_AVG float64 48 COMMONAREA_AVG float64 49 ELEVATORS_AVG float64 50 ENTRANCES_AVG float64 51 FLOORSMAX_AVG float64 52 FLOORSMIN_AVG float64 53 LANDAREA_AVG float64 54 LIVINGAPARTMENTS_AVG float64 55 LIVINGAREA_AVG float64 56 NONLIVINGAPARTMENTS_AVG float64 57 NONLIVINGAREA_AVG float64 58 APARTMENTS_MODE float64 59 BASEMENTAREA_MODE float64 60 YEARS_BEGINEXPLUATATION_MODE float64 61 YEARS_BUILD_MODE float64 62 COMMONAREA_MODE float64 63 ELEVATORS_MODE float64 64 ENTRANCES_MODE float64 65 FLOORSMAX_MODE float64 66 FLOORSMIN_MODE float64 67 LANDAREA_MODE float64 68 LIVINGAPARTMENTS_MODE float64 69 LIVINGAREA_MODE float64 70 NONLIVINGAPARTMENTS_MODE float64 71 NONLIVINGAREA_MODE float64 72 APARTMENTS_MEDI float64 73 BASEMENTAREA_MEDI float64 74 YEARS_BEGINEXPLUATATION_MEDI float64 75 YEARS_BUILD_MEDI float64 76 COMMONAREA_MEDI float64 77 ELEVATORS_MEDI float64 78 ENTRANCES_MEDI float64 79 FLOORSMAX_MEDI float64 80 FLOORSMIN_MEDI float64 81 LANDAREA_MEDI float64 82 LIVINGAPARTMENTS_MEDI float64 83 LIVINGAREA_MEDI float64 84 NONLIVINGAPARTMENTS_MEDI float64 85 NONLIVINGAREA_MEDI float64 86 FONDKAPREMONT_MODE object 87 HOUSETYPE_MODE object 88 TOTALAREA_MODE float64 89 WALLSMATERIAL_MODE object 90 EMERGENCYSTATE_MODE object 91 OBS_30_CNT_SOCIAL_CIRCLE float64 92 DEF_30_CNT_SOCIAL_CIRCLE float64 93 OBS_60_CNT_SOCIAL_CIRCLE float64 94 DEF_60_CNT_SOCIAL_CIRCLE float64 95 DAYS_LAST_PHONE_CHANGE float64 96 FLAG_DOCUMENT_2 int64 97 FLAG_DOCUMENT_3 int64 98 FLAG_DOCUMENT_4 int64 99 FLAG_DOCUMENT_5 int64 100 FLAG_DOCUMENT_6 int64 101 FLAG_DOCUMENT_7 int64 102 FLAG_DOCUMENT_8 int64 103 FLAG_DOCUMENT_9 int64 104 FLAG_DOCUMENT_10 int64 105 FLAG_DOCUMENT_11 int64 106 FLAG_DOCUMENT_12 int64 107 FLAG_DOCUMENT_13 int64 108 FLAG_DOCUMENT_14 int64 109 FLAG_DOCUMENT_15 int64 110 FLAG_DOCUMENT_16 int64 111 FLAG_DOCUMENT_17 int64 112 FLAG_DOCUMENT_18 int64 113 FLAG_DOCUMENT_19 int64 114 FLAG_DOCUMENT_20 int64 115 FLAG_DOCUMENT_21 int64 116 AMT_REQ_CREDIT_BUREAU_HOUR float64 117 AMT_REQ_CREDIT_BUREAU_DAY float64 118 AMT_REQ_CREDIT_BUREAU_WEEK float64 119 AMT_REQ_CREDIT_BUREAU_MON float64 120 AMT_REQ_CREDIT_BUREAU_QRT float64 121 AMT_REQ_CREDIT_BUREAU_YEAR float64 dtypes: float64(65), int64(41), object(16) memory usage: 286.2+ MB
previous_application.describe()
| SK_ID_PREV | SK_ID_CURR | AMT_ANNUITY | AMT_APPLICATION | AMT_CREDIT | AMT_DOWN_PAYMENT | AMT_GOODS_PRICE | HOUR_APPR_PROCESS_START | NFLAG_LAST_APPL_IN_DAY | RATE_DOWN_PAYMENT | RATE_INTEREST_PRIMARY | RATE_INTEREST_PRIVILEGED | DAYS_DECISION | SELLERPLACE_AREA | CNT_PAYMENT | DAYS_FIRST_DRAWING | DAYS_FIRST_DUE | DAYS_LAST_DUE_1ST_VERSION | DAYS_LAST_DUE | DAYS_TERMINATION | NFLAG_INSURED_ON_APPROVAL | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 1.670214e+06 | 1.670214e+06 | 1.297979e+06 | 1.670214e+06 | 1.670213e+06 | 7.743700e+05 | 1.284699e+06 | 1.670214e+06 | 1.670214e+06 | 774370.000000 | 5951.000000 | 5951.000000 | 1.670214e+06 | 1.670214e+06 | 1.297984e+06 | 997149.000000 | 997149.000000 | 997149.000000 | 997149.000000 | 997149.000000 | 997149.000000 |
| mean | 1.923089e+06 | 2.783572e+05 | 1.595512e+04 | 1.752339e+05 | 1.961140e+05 | 6.697402e+03 | 2.278473e+05 | 1.248418e+01 | 9.964675e-01 | 0.079637 | 0.188357 | 0.773503 | -8.806797e+02 | 3.139511e+02 | 1.605408e+01 | 342209.855039 | 13826.269337 | 33767.774054 | 76582.403064 | 81992.343838 | 0.332570 |
| std | 5.325980e+05 | 1.028148e+05 | 1.478214e+04 | 2.927798e+05 | 3.185746e+05 | 2.092150e+04 | 3.153966e+05 | 3.334028e+00 | 5.932963e-02 | 0.107823 | 0.087671 | 0.100879 | 7.790997e+02 | 7.127443e+03 | 1.456729e+01 | 88916.115834 | 72444.869708 | 106857.034789 | 149647.415123 | 153303.516729 | 0.471134 |
| min | 1.000001e+06 | 1.000010e+05 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | -9.000000e-01 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | -0.000015 | 0.034781 | 0.373150 | -2.922000e+03 | -1.000000e+00 | 0.000000e+00 | -2922.000000 | -2892.000000 | -2801.000000 | -2889.000000 | -2874.000000 | 0.000000 |
| 25% | 1.461857e+06 | 1.893290e+05 | 6.321780e+03 | 1.872000e+04 | 2.416050e+04 | 0.000000e+00 | 5.084100e+04 | 1.000000e+01 | 1.000000e+00 | 0.000000 | 0.160716 | 0.715645 | -1.300000e+03 | -1.000000e+00 | 6.000000e+00 | 365243.000000 | -1628.000000 | -1242.000000 | -1314.000000 | -1270.000000 | 0.000000 |
| 50% | 1.923110e+06 | 2.787145e+05 | 1.125000e+04 | 7.104600e+04 | 8.054100e+04 | 1.638000e+03 | 1.123200e+05 | 1.200000e+01 | 1.000000e+00 | 0.051605 | 0.189122 | 0.835095 | -5.810000e+02 | 3.000000e+00 | 1.200000e+01 | 365243.000000 | -831.000000 | -361.000000 | -537.000000 | -499.000000 | 0.000000 |
| 75% | 2.384280e+06 | 3.675140e+05 | 2.065842e+04 | 1.803600e+05 | 2.164185e+05 | 7.740000e+03 | 2.340000e+05 | 1.500000e+01 | 1.000000e+00 | 0.108909 | 0.193330 | 0.852537 | -2.800000e+02 | 8.200000e+01 | 2.400000e+01 | 365243.000000 | -411.000000 | 129.000000 | -74.000000 | -44.000000 | 1.000000 |
| max | 2.845382e+06 | 4.562550e+05 | 4.180581e+05 | 6.905160e+06 | 6.905160e+06 | 3.060045e+06 | 6.905160e+06 | 2.300000e+01 | 1.000000e+00 | 1.000000 | 1.000000 | 1.000000 | -1.000000e+00 | 4.000000e+06 | 8.400000e+01 | 365243.000000 | 365243.000000 | 365243.000000 | 365243.000000 | 365243.000000 | 1.000000 |
application_data.describe()
| SK_ID_CURR | TARGET | CNT_CHILDREN | AMT_INCOME_TOTAL | AMT_CREDIT | AMT_ANNUITY | AMT_GOODS_PRICE | REGION_POPULATION_RELATIVE | DAYS_BIRTH | DAYS_EMPLOYED | DAYS_REGISTRATION | DAYS_ID_PUBLISH | OWN_CAR_AGE | FLAG_MOBIL | FLAG_EMP_PHONE | FLAG_WORK_PHONE | FLAG_CONT_MOBILE | FLAG_PHONE | FLAG_EMAIL | CNT_FAM_MEMBERS | REGION_RATING_CLIENT | REGION_RATING_CLIENT_W_CITY | HOUR_APPR_PROCESS_START | REG_REGION_NOT_LIVE_REGION | REG_REGION_NOT_WORK_REGION | LIVE_REGION_NOT_WORK_REGION | REG_CITY_NOT_LIVE_CITY | REG_CITY_NOT_WORK_CITY | LIVE_CITY_NOT_WORK_CITY | EXT_SOURCE_1 | EXT_SOURCE_2 | EXT_SOURCE_3 | APARTMENTS_AVG | BASEMENTAREA_AVG | YEARS_BEGINEXPLUATATION_AVG | YEARS_BUILD_AVG | COMMONAREA_AVG | ELEVATORS_AVG | ENTRANCES_AVG | FLOORSMAX_AVG | FLOORSMIN_AVG | LANDAREA_AVG | LIVINGAPARTMENTS_AVG | LIVINGAREA_AVG | NONLIVINGAPARTMENTS_AVG | NONLIVINGAREA_AVG | APARTMENTS_MODE | BASEMENTAREA_MODE | YEARS_BEGINEXPLUATATION_MODE | YEARS_BUILD_MODE | COMMONAREA_MODE | ELEVATORS_MODE | ENTRANCES_MODE | FLOORSMAX_MODE | FLOORSMIN_MODE | LANDAREA_MODE | LIVINGAPARTMENTS_MODE | LIVINGAREA_MODE | NONLIVINGAPARTMENTS_MODE | NONLIVINGAREA_MODE | APARTMENTS_MEDI | BASEMENTAREA_MEDI | YEARS_BEGINEXPLUATATION_MEDI | YEARS_BUILD_MEDI | COMMONAREA_MEDI | ELEVATORS_MEDI | ENTRANCES_MEDI | FLOORSMAX_MEDI | FLOORSMIN_MEDI | LANDAREA_MEDI | LIVINGAPARTMENTS_MEDI | LIVINGAREA_MEDI | NONLIVINGAPARTMENTS_MEDI | NONLIVINGAREA_MEDI | TOTALAREA_MODE | OBS_30_CNT_SOCIAL_CIRCLE | DEF_30_CNT_SOCIAL_CIRCLE | OBS_60_CNT_SOCIAL_CIRCLE | DEF_60_CNT_SOCIAL_CIRCLE | DAYS_LAST_PHONE_CHANGE | FLAG_DOCUMENT_2 | FLAG_DOCUMENT_3 | FLAG_DOCUMENT_4 | FLAG_DOCUMENT_5 | FLAG_DOCUMENT_6 | FLAG_DOCUMENT_7 | FLAG_DOCUMENT_8 | FLAG_DOCUMENT_9 | FLAG_DOCUMENT_10 | FLAG_DOCUMENT_11 | FLAG_DOCUMENT_12 | FLAG_DOCUMENT_13 | FLAG_DOCUMENT_14 | FLAG_DOCUMENT_15 | FLAG_DOCUMENT_16 | FLAG_DOCUMENT_17 | FLAG_DOCUMENT_18 | FLAG_DOCUMENT_19 | FLAG_DOCUMENT_20 | FLAG_DOCUMENT_21 | AMT_REQ_CREDIT_BUREAU_HOUR | AMT_REQ_CREDIT_BUREAU_DAY | AMT_REQ_CREDIT_BUREAU_WEEK | AMT_REQ_CREDIT_BUREAU_MON | AMT_REQ_CREDIT_BUREAU_QRT | AMT_REQ_CREDIT_BUREAU_YEAR | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 307511.000000 | 307511.000000 | 307511.000000 | 3.075110e+05 | 3.075110e+05 | 307499.000000 | 3.072330e+05 | 307511.000000 | 307511.000000 | 307511.000000 | 307511.000000 | 307511.000000 | 104582.000000 | 307511.000000 | 307511.000000 | 307511.000000 | 307511.000000 | 307511.000000 | 307511.000000 | 307509.000000 | 307511.000000 | 307511.000000 | 307511.000000 | 307511.000000 | 307511.000000 | 307511.000000 | 307511.000000 | 307511.000000 | 307511.000000 | 134133.000000 | 3.068510e+05 | 246546.000000 | 151450.00000 | 127568.000000 | 157504.000000 | 103023.000000 | 92646.000000 | 143620.000000 | 152683.000000 | 154491.000000 | 98869.000000 | 124921.000000 | 97312.000000 | 153161.000000 | 93997.000000 | 137829.000000 | 151450.000000 | 127568.000000 | 157504.000000 | 103023.000000 | 92646.000000 | 143620.000000 | 152683.000000 | 154491.000000 | 98869.000000 | 124921.000000 | 97312.000000 | 153161.000000 | 93997.000000 | 137829.000000 | 151450.000000 | 127568.000000 | 157504.000000 | 103023.000000 | 92646.000000 | 143620.000000 | 152683.000000 | 154491.000000 | 98869.000000 | 124921.000000 | 97312.000000 | 153161.000000 | 93997.000000 | 137829.000000 | 159080.000000 | 306490.000000 | 306490.000000 | 306490.000000 | 306490.000000 | 307510.000000 | 307511.000000 | 307511.000000 | 307511.000000 | 307511.000000 | 307511.000000 | 307511.000000 | 307511.000000 | 307511.000000 | 307511.000000 | 307511.000000 | 307511.000000 | 307511.000000 | 307511.000000 | 307511.00000 | 307511.000000 | 307511.000000 | 307511.000000 | 307511.000000 | 307511.000000 | 307511.000000 | 265992.000000 | 265992.000000 | 265992.000000 | 265992.000000 | 265992.000000 | 265992.000000 |
| mean | 278180.518577 | 0.080729 | 0.417052 | 1.687979e+05 | 5.990260e+05 | 27108.573909 | 5.383962e+05 | 0.020868 | -16036.995067 | 63815.045904 | -4986.120328 | -2994.202373 | 12.061091 | 0.999997 | 0.819889 | 0.199368 | 0.998133 | 0.281066 | 0.056720 | 2.152665 | 2.052463 | 2.031521 | 12.063419 | 0.015144 | 0.050769 | 0.040659 | 0.078173 | 0.230454 | 0.179555 | 0.502130 | 5.143927e-01 | 0.510853 | 0.11744 | 0.088442 | 0.977735 | 0.752471 | 0.044621 | 0.078942 | 0.149725 | 0.226282 | 0.231894 | 0.066333 | 0.100775 | 0.107399 | 0.008809 | 0.028358 | 0.114231 | 0.087543 | 0.977065 | 0.759637 | 0.042553 | 0.074490 | 0.145193 | 0.222315 | 0.228058 | 0.064958 | 0.105645 | 0.105975 | 0.008076 | 0.027022 | 0.117850 | 0.087955 | 0.977752 | 0.755746 | 0.044595 | 0.078078 | 0.149213 | 0.225897 | 0.231625 | 0.067169 | 0.101954 | 0.108607 | 0.008651 | 0.028236 | 0.102547 | 1.422245 | 0.143421 | 1.405292 | 0.100049 | -962.858788 | 0.000042 | 0.710023 | 0.000081 | 0.015115 | 0.088055 | 0.000192 | 0.081376 | 0.003896 | 0.000023 | 0.003912 | 0.000007 | 0.003525 | 0.002936 | 0.00121 | 0.009928 | 0.000267 | 0.008130 | 0.000595 | 0.000507 | 0.000335 | 0.006402 | 0.007000 | 0.034362 | 0.267395 | 0.265474 | 1.899974 |
| std | 102790.175348 | 0.272419 | 0.722121 | 2.371231e+05 | 4.024908e+05 | 14493.737315 | 3.694465e+05 | 0.013831 | 4363.988632 | 141275.766519 | 3522.886321 | 1509.450419 | 11.944812 | 0.001803 | 0.384280 | 0.399526 | 0.043164 | 0.449521 | 0.231307 | 0.910682 | 0.509034 | 0.502737 | 3.265832 | 0.122126 | 0.219526 | 0.197499 | 0.268444 | 0.421124 | 0.383817 | 0.211062 | 1.910602e-01 | 0.194844 | 0.10824 | 0.082438 | 0.059223 | 0.113280 | 0.076036 | 0.134576 | 0.100049 | 0.144641 | 0.161380 | 0.081184 | 0.092576 | 0.110565 | 0.047732 | 0.069523 | 0.107936 | 0.084307 | 0.064575 | 0.110111 | 0.074445 | 0.132256 | 0.100977 | 0.143709 | 0.161160 | 0.081750 | 0.097880 | 0.111845 | 0.046276 | 0.070254 | 0.109076 | 0.082179 | 0.059897 | 0.112066 | 0.076144 | 0.134467 | 0.100368 | 0.145067 | 0.161934 | 0.082167 | 0.093642 | 0.112260 | 0.047415 | 0.070166 | 0.107462 | 2.400989 | 0.446698 | 2.379803 | 0.362291 | 826.808487 | 0.006502 | 0.453752 | 0.009016 | 0.122010 | 0.283376 | 0.013850 | 0.273412 | 0.062295 | 0.004771 | 0.062424 | 0.002550 | 0.059268 | 0.054110 | 0.03476 | 0.099144 | 0.016327 | 0.089798 | 0.024387 | 0.022518 | 0.018299 | 0.083849 | 0.110757 | 0.204685 | 0.916002 | 0.794056 | 1.869295 |
| min | 100002.000000 | 0.000000 | 0.000000 | 2.565000e+04 | 4.500000e+04 | 1615.500000 | 4.050000e+04 | 0.000290 | -25229.000000 | -17912.000000 | -24672.000000 | -7197.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 1.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.014568 | 8.173617e-08 | 0.000527 | 0.00000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | -4292.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 25% | 189145.500000 | 0.000000 | 0.000000 | 1.125000e+05 | 2.700000e+05 | 16524.000000 | 2.385000e+05 | 0.010006 | -19682.000000 | -2760.000000 | -7479.500000 | -4299.000000 | 5.000000 | 1.000000 | 1.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 2.000000 | 2.000000 | 2.000000 | 10.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.334007 | 3.924574e-01 | 0.370650 | 0.05770 | 0.044200 | 0.976700 | 0.687200 | 0.007800 | 0.000000 | 0.069000 | 0.166700 | 0.083300 | 0.018700 | 0.050400 | 0.045300 | 0.000000 | 0.000000 | 0.052500 | 0.040700 | 0.976700 | 0.699400 | 0.007200 | 0.000000 | 0.069000 | 0.166700 | 0.083300 | 0.016600 | 0.054200 | 0.042700 | 0.000000 | 0.000000 | 0.058300 | 0.043700 | 0.976700 | 0.691400 | 0.007900 | 0.000000 | 0.069000 | 0.166700 | 0.083300 | 0.018700 | 0.051300 | 0.045700 | 0.000000 | 0.000000 | 0.041200 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | -1570.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 50% | 278202.000000 | 0.000000 | 0.000000 | 1.471500e+05 | 5.135310e+05 | 24903.000000 | 4.500000e+05 | 0.018850 | -15750.000000 | -1213.000000 | -4504.000000 | -3254.000000 | 9.000000 | 1.000000 | 1.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 2.000000 | 2.000000 | 2.000000 | 12.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.505998 | 5.659614e-01 | 0.535276 | 0.08760 | 0.076300 | 0.981600 | 0.755200 | 0.021100 | 0.000000 | 0.137900 | 0.166700 | 0.208300 | 0.048100 | 0.075600 | 0.074500 | 0.000000 | 0.003600 | 0.084000 | 0.074600 | 0.981600 | 0.764800 | 0.019000 | 0.000000 | 0.137900 | 0.166700 | 0.208300 | 0.045800 | 0.077100 | 0.073100 | 0.000000 | 0.001100 | 0.086400 | 0.075800 | 0.981600 | 0.758500 | 0.020800 | 0.000000 | 0.137900 | 0.166700 | 0.208300 | 0.048700 | 0.076100 | 0.074900 | 0.000000 | 0.003100 | 0.068800 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | -757.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 |
| 75% | 367142.500000 | 0.000000 | 1.000000 | 2.025000e+05 | 8.086500e+05 | 34596.000000 | 6.795000e+05 | 0.028663 | -12413.000000 | -289.000000 | -2010.000000 | -1720.000000 | 15.000000 | 1.000000 | 1.000000 | 0.000000 | 1.000000 | 1.000000 | 0.000000 | 3.000000 | 2.000000 | 2.000000 | 14.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.675053 | 6.636171e-01 | 0.669057 | 0.14850 | 0.112200 | 0.986600 | 0.823200 | 0.051500 | 0.120000 | 0.206900 | 0.333300 | 0.375000 | 0.085600 | 0.121000 | 0.129900 | 0.003900 | 0.027700 | 0.143900 | 0.112400 | 0.986600 | 0.823600 | 0.049000 | 0.120800 | 0.206900 | 0.333300 | 0.375000 | 0.084100 | 0.131300 | 0.125200 | 0.003900 | 0.023100 | 0.148900 | 0.111600 | 0.986600 | 0.825600 | 0.051300 | 0.120000 | 0.206900 | 0.333300 | 0.375000 | 0.086800 | 0.123100 | 0.130300 | 0.003900 | 0.026600 | 0.127600 | 2.000000 | 0.000000 | 2.000000 | 0.000000 | -274.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 3.000000 |
| max | 456255.000000 | 1.000000 | 19.000000 | 1.170000e+08 | 4.050000e+06 | 258025.500000 | 4.050000e+06 | 0.072508 | -7489.000000 | 365243.000000 | 0.000000 | 0.000000 | 91.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 20.000000 | 3.000000 | 3.000000 | 23.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 0.962693 | 8.549997e-01 | 0.896010 | 1.00000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 348.000000 | 34.000000 | 344.000000 | 24.000000 | 0.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.00000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 4.000000 | 9.000000 | 8.000000 | 27.000000 | 261.000000 | 25.000000 |
previous_application.agg(['count','size','nunique'])
| SK_ID_PREV | SK_ID_CURR | NAME_CONTRACT_TYPE | AMT_ANNUITY | AMT_APPLICATION | AMT_CREDIT | AMT_DOWN_PAYMENT | AMT_GOODS_PRICE | WEEKDAY_APPR_PROCESS_START | HOUR_APPR_PROCESS_START | FLAG_LAST_APPL_PER_CONTRACT | NFLAG_LAST_APPL_IN_DAY | RATE_DOWN_PAYMENT | RATE_INTEREST_PRIMARY | RATE_INTEREST_PRIVILEGED | NAME_CASH_LOAN_PURPOSE | NAME_CONTRACT_STATUS | DAYS_DECISION | NAME_PAYMENT_TYPE | CODE_REJECT_REASON | NAME_TYPE_SUITE | NAME_CLIENT_TYPE | NAME_GOODS_CATEGORY | NAME_PORTFOLIO | NAME_PRODUCT_TYPE | CHANNEL_TYPE | SELLERPLACE_AREA | NAME_SELLER_INDUSTRY | CNT_PAYMENT | NAME_YIELD_GROUP | PRODUCT_COMBINATION | DAYS_FIRST_DRAWING | DAYS_FIRST_DUE | DAYS_LAST_DUE_1ST_VERSION | DAYS_LAST_DUE | DAYS_TERMINATION | NFLAG_INSURED_ON_APPROVAL | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 1670214 | 1670214 | 1670214 | 1297979 | 1670214 | 1670213 | 774370 | 1284699 | 1670214 | 1670214 | 1670214 | 1670214 | 774370 | 5951 | 5951 | 1670214 | 1670214 | 1670214 | 1670214 | 1670214 | 849809 | 1670214 | 1670214 | 1670214 | 1670214 | 1670214 | 1670214 | 1670214 | 1297984 | 1670214 | 1669868 | 997149 | 997149 | 997149 | 997149 | 997149 | 997149 |
| size | 1670214 | 1670214 | 1670214 | 1670214 | 1670214 | 1670214 | 1670214 | 1670214 | 1670214 | 1670214 | 1670214 | 1670214 | 1670214 | 1670214 | 1670214 | 1670214 | 1670214 | 1670214 | 1670214 | 1670214 | 1670214 | 1670214 | 1670214 | 1670214 | 1670214 | 1670214 | 1670214 | 1670214 | 1670214 | 1670214 | 1670214 | 1670214 | 1670214 | 1670214 | 1670214 | 1670214 | 1670214 |
| nunique | 1670214 | 338857 | 4 | 357959 | 93885 | 86803 | 29278 | 93885 | 7 | 24 | 2 | 2 | 207033 | 148 | 25 | 25 | 4 | 2922 | 4 | 9 | 7 | 4 | 28 | 5 | 3 | 8 | 2097 | 11 | 49 | 5 | 17 | 2838 | 2892 | 4605 | 2873 | 2830 | 2 |
application_data.agg(['count','size','nunique'])
| SK_ID_CURR | TARGET | NAME_CONTRACT_TYPE | CODE_GENDER | FLAG_OWN_CAR | FLAG_OWN_REALTY | CNT_CHILDREN | AMT_INCOME_TOTAL | AMT_CREDIT | AMT_ANNUITY | AMT_GOODS_PRICE | NAME_TYPE_SUITE | NAME_INCOME_TYPE | NAME_EDUCATION_TYPE | NAME_FAMILY_STATUS | NAME_HOUSING_TYPE | REGION_POPULATION_RELATIVE | DAYS_BIRTH | DAYS_EMPLOYED | DAYS_REGISTRATION | DAYS_ID_PUBLISH | OWN_CAR_AGE | FLAG_MOBIL | FLAG_EMP_PHONE | FLAG_WORK_PHONE | FLAG_CONT_MOBILE | FLAG_PHONE | FLAG_EMAIL | OCCUPATION_TYPE | CNT_FAM_MEMBERS | REGION_RATING_CLIENT | REGION_RATING_CLIENT_W_CITY | WEEKDAY_APPR_PROCESS_START | HOUR_APPR_PROCESS_START | REG_REGION_NOT_LIVE_REGION | REG_REGION_NOT_WORK_REGION | LIVE_REGION_NOT_WORK_REGION | REG_CITY_NOT_LIVE_CITY | REG_CITY_NOT_WORK_CITY | LIVE_CITY_NOT_WORK_CITY | ORGANIZATION_TYPE | EXT_SOURCE_1 | EXT_SOURCE_2 | EXT_SOURCE_3 | APARTMENTS_AVG | BASEMENTAREA_AVG | YEARS_BEGINEXPLUATATION_AVG | YEARS_BUILD_AVG | COMMONAREA_AVG | ELEVATORS_AVG | ENTRANCES_AVG | FLOORSMAX_AVG | FLOORSMIN_AVG | LANDAREA_AVG | LIVINGAPARTMENTS_AVG | LIVINGAREA_AVG | NONLIVINGAPARTMENTS_AVG | NONLIVINGAREA_AVG | APARTMENTS_MODE | BASEMENTAREA_MODE | YEARS_BEGINEXPLUATATION_MODE | YEARS_BUILD_MODE | COMMONAREA_MODE | ELEVATORS_MODE | ENTRANCES_MODE | FLOORSMAX_MODE | FLOORSMIN_MODE | LANDAREA_MODE | LIVINGAPARTMENTS_MODE | LIVINGAREA_MODE | NONLIVINGAPARTMENTS_MODE | NONLIVINGAREA_MODE | APARTMENTS_MEDI | BASEMENTAREA_MEDI | YEARS_BEGINEXPLUATATION_MEDI | YEARS_BUILD_MEDI | COMMONAREA_MEDI | ELEVATORS_MEDI | ENTRANCES_MEDI | FLOORSMAX_MEDI | FLOORSMIN_MEDI | LANDAREA_MEDI | LIVINGAPARTMENTS_MEDI | LIVINGAREA_MEDI | NONLIVINGAPARTMENTS_MEDI | NONLIVINGAREA_MEDI | FONDKAPREMONT_MODE | HOUSETYPE_MODE | TOTALAREA_MODE | WALLSMATERIAL_MODE | EMERGENCYSTATE_MODE | OBS_30_CNT_SOCIAL_CIRCLE | DEF_30_CNT_SOCIAL_CIRCLE | OBS_60_CNT_SOCIAL_CIRCLE | DEF_60_CNT_SOCIAL_CIRCLE | DAYS_LAST_PHONE_CHANGE | FLAG_DOCUMENT_2 | FLAG_DOCUMENT_3 | FLAG_DOCUMENT_4 | FLAG_DOCUMENT_5 | FLAG_DOCUMENT_6 | FLAG_DOCUMENT_7 | FLAG_DOCUMENT_8 | FLAG_DOCUMENT_9 | FLAG_DOCUMENT_10 | FLAG_DOCUMENT_11 | FLAG_DOCUMENT_12 | FLAG_DOCUMENT_13 | FLAG_DOCUMENT_14 | FLAG_DOCUMENT_15 | FLAG_DOCUMENT_16 | FLAG_DOCUMENT_17 | FLAG_DOCUMENT_18 | FLAG_DOCUMENT_19 | FLAG_DOCUMENT_20 | FLAG_DOCUMENT_21 | AMT_REQ_CREDIT_BUREAU_HOUR | AMT_REQ_CREDIT_BUREAU_DAY | AMT_REQ_CREDIT_BUREAU_WEEK | AMT_REQ_CREDIT_BUREAU_MON | AMT_REQ_CREDIT_BUREAU_QRT | AMT_REQ_CREDIT_BUREAU_YEAR | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 307511 | 307511 | 307511 | 307511 | 307511 | 307511 | 307511 | 307511 | 307511 | 307499 | 307233 | 306219 | 307511 | 307511 | 307511 | 307511 | 307511 | 307511 | 307511 | 307511 | 307511 | 104582 | 307511 | 307511 | 307511 | 307511 | 307511 | 307511 | 211120 | 307509 | 307511 | 307511 | 307511 | 307511 | 307511 | 307511 | 307511 | 307511 | 307511 | 307511 | 307511 | 134133 | 306851 | 246546 | 151450 | 127568 | 157504 | 103023 | 92646 | 143620 | 152683 | 154491 | 98869 | 124921 | 97312 | 153161 | 93997 | 137829 | 151450 | 127568 | 157504 | 103023 | 92646 | 143620 | 152683 | 154491 | 98869 | 124921 | 97312 | 153161 | 93997 | 137829 | 151450 | 127568 | 157504 | 103023 | 92646 | 143620 | 152683 | 154491 | 98869 | 124921 | 97312 | 153161 | 93997 | 137829 | 97216 | 153214 | 159080 | 151170 | 161756 | 306490 | 306490 | 306490 | 306490 | 307510 | 307511 | 307511 | 307511 | 307511 | 307511 | 307511 | 307511 | 307511 | 307511 | 307511 | 307511 | 307511 | 307511 | 307511 | 307511 | 307511 | 307511 | 307511 | 307511 | 307511 | 265992 | 265992 | 265992 | 265992 | 265992 | 265992 |
| size | 307511 | 307511 | 307511 | 307511 | 307511 | 307511 | 307511 | 307511 | 307511 | 307511 | 307511 | 307511 | 307511 | 307511 | 307511 | 307511 | 307511 | 307511 | 307511 | 307511 | 307511 | 307511 | 307511 | 307511 | 307511 | 307511 | 307511 | 307511 | 307511 | 307511 | 307511 | 307511 | 307511 | 307511 | 307511 | 307511 | 307511 | 307511 | 307511 | 307511 | 307511 | 307511 | 307511 | 307511 | 307511 | 307511 | 307511 | 307511 | 307511 | 307511 | 307511 | 307511 | 307511 | 307511 | 307511 | 307511 | 307511 | 307511 | 307511 | 307511 | 307511 | 307511 | 307511 | 307511 | 307511 | 307511 | 307511 | 307511 | 307511 | 307511 | 307511 | 307511 | 307511 | 307511 | 307511 | 307511 | 307511 | 307511 | 307511 | 307511 | 307511 | 307511 | 307511 | 307511 | 307511 | 307511 | 307511 | 307511 | 307511 | 307511 | 307511 | 307511 | 307511 | 307511 | 307511 | 307511 | 307511 | 307511 | 307511 | 307511 | 307511 | 307511 | 307511 | 307511 | 307511 | 307511 | 307511 | 307511 | 307511 | 307511 | 307511 | 307511 | 307511 | 307511 | 307511 | 307511 | 307511 | 307511 | 307511 | 307511 | 307511 | 307511 |
| nunique | 307511 | 2 | 2 | 3 | 2 | 2 | 15 | 2548 | 5603 | 13672 | 1002 | 7 | 8 | 5 | 6 | 6 | 81 | 17460 | 12574 | 15688 | 6168 | 62 | 2 | 2 | 2 | 2 | 2 | 2 | 18 | 17 | 3 | 3 | 7 | 24 | 2 | 2 | 2 | 2 | 2 | 2 | 58 | 114584 | 119831 | 814 | 2339 | 3780 | 285 | 149 | 3181 | 257 | 285 | 403 | 305 | 3527 | 1868 | 5199 | 386 | 3290 | 760 | 3841 | 221 | 154 | 3128 | 26 | 30 | 25 | 25 | 3563 | 736 | 5301 | 167 | 3327 | 1148 | 3772 | 245 | 151 | 3202 | 46 | 46 | 49 | 47 | 3560 | 1097 | 5281 | 214 | 3323 | 4 | 3 | 5116 | 7 | 2 | 33 | 10 | 33 | 9 | 3773 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 5 | 9 | 9 | 24 | 11 | 25 |
round(previous_application.isnull().sum()/previous_application.shape[0]*100,2).sort_values(ascending=False)
RATE_INTEREST_PRIVILEGED 99.64 RATE_INTEREST_PRIMARY 99.64 RATE_DOWN_PAYMENT 53.64 AMT_DOWN_PAYMENT 53.64 NAME_TYPE_SUITE 49.12 NFLAG_INSURED_ON_APPROVAL 40.30 DAYS_FIRST_DRAWING 40.30 DAYS_FIRST_DUE 40.30 DAYS_LAST_DUE_1ST_VERSION 40.30 DAYS_LAST_DUE 40.30 DAYS_TERMINATION 40.30 AMT_GOODS_PRICE 23.08 AMT_ANNUITY 22.29 CNT_PAYMENT 22.29 PRODUCT_COMBINATION 0.02 CHANNEL_TYPE 0.00 NAME_PRODUCT_TYPE 0.00 NAME_YIELD_GROUP 0.00 SELLERPLACE_AREA 0.00 NAME_SELLER_INDUSTRY 0.00 NAME_GOODS_CATEGORY 0.00 NAME_PORTFOLIO 0.00 SK_ID_PREV 0.00 NAME_CLIENT_TYPE 0.00 CODE_REJECT_REASON 0.00 SK_ID_CURR 0.00 DAYS_DECISION 0.00 NAME_CONTRACT_STATUS 0.00 NAME_CASH_LOAN_PURPOSE 0.00 NFLAG_LAST_APPL_IN_DAY 0.00 FLAG_LAST_APPL_PER_CONTRACT 0.00 HOUR_APPR_PROCESS_START 0.00 WEEKDAY_APPR_PROCESS_START 0.00 AMT_CREDIT 0.00 AMT_APPLICATION 0.00 NAME_CONTRACT_TYPE 0.00 NAME_PAYMENT_TYPE 0.00 dtype: float64
deleted_column.shape
(307511, 49)
deleted_column.columns
Index(['OWN_CAR_AGE', 'EXT_SOURCE_1', 'APARTMENTS_AVG', 'BASEMENTAREA_AVG', 'YEARS_BEGINEXPLUATATION_AVG', 'YEARS_BUILD_AVG', 'COMMONAREA_AVG', 'ELEVATORS_AVG', 'ENTRANCES_AVG', 'FLOORSMAX_AVG', 'FLOORSMIN_AVG', 'LANDAREA_AVG', 'LIVINGAPARTMENTS_AVG', 'LIVINGAREA_AVG', 'NONLIVINGAPARTMENTS_AVG', 'NONLIVINGAREA_AVG', 'APARTMENTS_MODE', 'BASEMENTAREA_MODE', 'YEARS_BEGINEXPLUATATION_MODE', 'YEARS_BUILD_MODE', 'COMMONAREA_MODE', 'ELEVATORS_MODE', 'ENTRANCES_MODE', 'FLOORSMAX_MODE', 'FLOORSMIN_MODE', 'LANDAREA_MODE', 'LIVINGAPARTMENTS_MODE', 'LIVINGAREA_MODE', 'NONLIVINGAPARTMENTS_MODE', 'NONLIVINGAREA_MODE', 'APARTMENTS_MEDI', 'BASEMENTAREA_MEDI', 'YEARS_BEGINEXPLUATATION_MEDI', 'YEARS_BUILD_MEDI', 'COMMONAREA_MEDI', 'ELEVATORS_MEDI', 'ENTRANCES_MEDI', 'FLOORSMAX_MEDI', 'FLOORSMIN_MEDI', 'LANDAREA_MEDI', 'LIVINGAPARTMENTS_MEDI', 'LIVINGAREA_MEDI', 'NONLIVINGAPARTMENTS_MEDI', 'NONLIVINGAREA_MEDI', 'FONDKAPREMONT_MODE', 'HOUSETYPE_MODE', 'TOTALAREA_MODE', 'WALLSMATERIAL_MODE', 'EMERGENCYSTATE_MODE'], dtype='object')
round(application_data_up.isnull().sum()/application_data_up.shape[0]*100,2).sort_values(ascending=False)
OCCUPATION_TYPE 31.35 EXT_SOURCE_3 19.83 AMT_REQ_CREDIT_BUREAU_YEAR 13.50 AMT_REQ_CREDIT_BUREAU_QRT 13.50 AMT_REQ_CREDIT_BUREAU_MON 13.50 AMT_REQ_CREDIT_BUREAU_WEEK 13.50 AMT_REQ_CREDIT_BUREAU_DAY 13.50 AMT_REQ_CREDIT_BUREAU_HOUR 13.50 NAME_TYPE_SUITE 0.42 OBS_30_CNT_SOCIAL_CIRCLE 0.33 DEF_30_CNT_SOCIAL_CIRCLE 0.33 OBS_60_CNT_SOCIAL_CIRCLE 0.33 DEF_60_CNT_SOCIAL_CIRCLE 0.33 EXT_SOURCE_2 0.21 AMT_GOODS_PRICE 0.09 FLAG_DOCUMENT_7 0.00 DAYS_LAST_PHONE_CHANGE 0.00 FLAG_DOCUMENT_2 0.00 FLAG_DOCUMENT_3 0.00 FLAG_DOCUMENT_4 0.00 FLAG_DOCUMENT_5 0.00 FLAG_DOCUMENT_6 0.00 FLAG_DOCUMENT_11 0.00 FLAG_DOCUMENT_8 0.00 FLAG_DOCUMENT_9 0.00 FLAG_DOCUMENT_10 0.00 LIVE_CITY_NOT_WORK_CITY 0.00 FLAG_DOCUMENT_12 0.00 FLAG_DOCUMENT_13 0.00 FLAG_DOCUMENT_14 0.00 FLAG_DOCUMENT_15 0.00 FLAG_DOCUMENT_16 0.00 FLAG_DOCUMENT_17 0.00 FLAG_DOCUMENT_18 0.00 FLAG_DOCUMENT_19 0.00 FLAG_DOCUMENT_20 0.00 FLAG_DOCUMENT_21 0.00 ORGANIZATION_TYPE 0.00 SK_ID_CURR 0.00 REG_CITY_NOT_WORK_CITY 0.00 AMT_ANNUITY 0.00 DAYS_BIRTH 0.00 REGION_POPULATION_RELATIVE 0.00 NAME_HOUSING_TYPE 0.00 NAME_FAMILY_STATUS 0.00 NAME_EDUCATION_TYPE 0.00 NAME_INCOME_TYPE 0.00 AMT_CREDIT 0.00 TARGET 0.00 AMT_INCOME_TOTAL 0.00 CNT_CHILDREN 0.00 FLAG_OWN_REALTY 0.00 FLAG_OWN_CAR 0.00 CODE_GENDER 0.00 NAME_CONTRACT_TYPE 0.00 DAYS_EMPLOYED 0.00 DAYS_REGISTRATION 0.00 DAYS_ID_PUBLISH 0.00 FLAG_MOBIL 0.00 FLAG_EMP_PHONE 0.00 FLAG_WORK_PHONE 0.00 FLAG_CONT_MOBILE 0.00 FLAG_PHONE 0.00 FLAG_EMAIL 0.00 CNT_FAM_MEMBERS 0.00 REGION_RATING_CLIENT 0.00 REGION_RATING_CLIENT_W_CITY 0.00 WEEKDAY_APPR_PROCESS_START 0.00 HOUR_APPR_PROCESS_START 0.00 REG_REGION_NOT_LIVE_REGION 0.00 REG_REGION_NOT_WORK_REGION 0.00 LIVE_REGION_NOT_WORK_REGION 0.00 REG_CITY_NOT_LIVE_CITY 0.00 dtype: float64
application_data_up["OCCUPATION_TYPE"].unique()
array(['Laborers', 'Core staff', 'Accountants', 'Managers', nan,
'Drivers', 'Sales staff', 'Cleaning staff', 'Cooking staff',
'Private service staff', 'Medicine staff', 'Security staff',
'High skill tech staff', 'Waiters/barmen staff',
'Low-skill Laborers', 'Realty agents', 'Secretaries', 'IT staff',
'HR staff'], dtype=object)
application_data_up["OCCUPATION_TYPE"].value_counts()
Laborers 55186 Sales staff 32102 Core staff 27570 Managers 21371 Drivers 18603 High skill tech staff 11380 Accountants 9813 Medicine staff 8537 Security staff 6721 Cooking staff 5946 Cleaning staff 4653 Private service staff 2652 Low-skill Laborers 2093 Waiters/barmen staff 1348 Secretaries 1305 Realty agents 751 HR staff 563 IT staff 526 Name: OCCUPATION_TYPE, dtype: int64
application_data_up["OCCUPATION_TYPE"].isnull().sum()
96391
application_data_up["OCCUPATION_TYPE"].replace(np.NaN, "unknown", inplace=True)
application_data_up["OCCUPATION_TYPE"].isnull().sum()
0
application_data_up["OCCUPATION_TYPE"].value_counts()
unknown 96391 Laborers 55186 Sales staff 32102 Core staff 27570 Managers 21371 Drivers 18603 High skill tech staff 11380 Accountants 9813 Medicine staff 8537 Security staff 6721 Cooking staff 5946 Cleaning staff 4653 Private service staff 2652 Low-skill Laborers 2093 Waiters/barmen staff 1348 Secretaries 1305 Realty agents 751 HR staff 563 IT staff 526 Name: OCCUPATION_TYPE, dtype: int64
round(application_data_up.isnull().sum()/application_data_up.shape[0]*100,2).sort_values(ascending=False)
EXT_SOURCE_3 19.83 AMT_REQ_CREDIT_BUREAU_YEAR 13.50 AMT_REQ_CREDIT_BUREAU_QRT 13.50 AMT_REQ_CREDIT_BUREAU_MON 13.50 AMT_REQ_CREDIT_BUREAU_WEEK 13.50 AMT_REQ_CREDIT_BUREAU_DAY 13.50 AMT_REQ_CREDIT_BUREAU_HOUR 13.50 NAME_TYPE_SUITE 0.42 OBS_30_CNT_SOCIAL_CIRCLE 0.33 DEF_30_CNT_SOCIAL_CIRCLE 0.33 OBS_60_CNT_SOCIAL_CIRCLE 0.33 DEF_60_CNT_SOCIAL_CIRCLE 0.33 EXT_SOURCE_2 0.21 AMT_GOODS_PRICE 0.09 FLAG_DOCUMENT_8 0.00 DAYS_LAST_PHONE_CHANGE 0.00 FLAG_DOCUMENT_2 0.00 FLAG_DOCUMENT_3 0.00 FLAG_DOCUMENT_4 0.00 FLAG_DOCUMENT_5 0.00 FLAG_DOCUMENT_6 0.00 FLAG_DOCUMENT_7 0.00 FLAG_DOCUMENT_12 0.00 FLAG_DOCUMENT_9 0.00 FLAG_DOCUMENT_10 0.00 FLAG_DOCUMENT_11 0.00 LIVE_CITY_NOT_WORK_CITY 0.00 FLAG_DOCUMENT_13 0.00 FLAG_DOCUMENT_14 0.00 FLAG_DOCUMENT_15 0.00 FLAG_DOCUMENT_16 0.00 FLAG_DOCUMENT_17 0.00 FLAG_DOCUMENT_18 0.00 FLAG_DOCUMENT_19 0.00 FLAG_DOCUMENT_20 0.00 FLAG_DOCUMENT_21 0.00 ORGANIZATION_TYPE 0.00 SK_ID_CURR 0.00 REG_CITY_NOT_WORK_CITY 0.00 AMT_ANNUITY 0.00 DAYS_BIRTH 0.00 REGION_POPULATION_RELATIVE 0.00 NAME_HOUSING_TYPE 0.00 NAME_FAMILY_STATUS 0.00 NAME_EDUCATION_TYPE 0.00 NAME_INCOME_TYPE 0.00 AMT_CREDIT 0.00 DAYS_REGISTRATION 0.00 AMT_INCOME_TOTAL 0.00 CNT_CHILDREN 0.00 FLAG_OWN_REALTY 0.00 FLAG_OWN_CAR 0.00 CODE_GENDER 0.00 NAME_CONTRACT_TYPE 0.00 DAYS_EMPLOYED 0.00 DAYS_ID_PUBLISH 0.00 TARGET 0.00 REGION_RATING_CLIENT 0.00 LIVE_REGION_NOT_WORK_REGION 0.00 REG_REGION_NOT_WORK_REGION 0.00 REG_REGION_NOT_LIVE_REGION 0.00 HOUR_APPR_PROCESS_START 0.00 WEEKDAY_APPR_PROCESS_START 0.00 REGION_RATING_CLIENT_W_CITY 0.00 CNT_FAM_MEMBERS 0.00 FLAG_MOBIL 0.00 OCCUPATION_TYPE 0.00 FLAG_EMAIL 0.00 FLAG_PHONE 0.00 FLAG_CONT_MOBILE 0.00 FLAG_WORK_PHONE 0.00 FLAG_EMP_PHONE 0.00 REG_CITY_NOT_LIVE_CITY 0.00 dtype: float64
not_required=['FLAG_DOCUMENT_21','FLAG_DOCUMENT_20','FLAG_DOCUMENT_19','FLAG_DOCUMENT_18','FLAG_DOCUMENT_17','FLAG_DOCUMENT_16',
'FLAG_DOCUMENT_15','FLAG_DOCUMENT_14','FLAG_DOCUMENT_13','FLAG_DOCUMENT_12','FLAG_DOCUMENT_11',
'FLAG_DOCUMENT_10','FLAG_DOCUMENT_9','FLAG_DOCUMENT_8','FLAG_DOCUMENT_7','FLAG_DOCUMENT_6','FLAG_DOCUMENT_5',
'FLAG_DOCUMENT_4','FLAG_DOCUMENT_3','FLAG_DOCUMENT_2','OBS_30_CNT_SOCIAL_CIRCLE','DEF_30_CNT_SOCIAL_CIRCLE',
'OBS_60_CNT_SOCIAL_CIRCLE','DEF_60_CNT_SOCIAL_CIRCLE','AMT_REQ_CREDIT_BUREAU_YEAR','AMT_REQ_CREDIT_BUREAU_MON',
'AMT_REQ_CREDIT_BUREAU_WEEK','AMT_REQ_CREDIT_BUREAU_DAY','AMT_REQ_CREDIT_BUREAU_HOUR','AMT_REQ_CREDIT_BUREAU_QRT']
application_data_up.drop(labels=not_required,axis=1,inplace=True)
application_data_up.shape
(307511, 43)
round(application_data_up.isnull().sum()/application_data_up.shape[0]*100,2).sort_values(ascending=False)
EXT_SOURCE_3 19.83 NAME_TYPE_SUITE 0.42 EXT_SOURCE_2 0.21 AMT_GOODS_PRICE 0.09 HOUR_APPR_PROCESS_START 0.00 FLAG_PHONE 0.00 FLAG_EMAIL 0.00 OCCUPATION_TYPE 0.00 CNT_FAM_MEMBERS 0.00 REGION_RATING_CLIENT 0.00 REGION_RATING_CLIENT_W_CITY 0.00 WEEKDAY_APPR_PROCESS_START 0.00 SK_ID_CURR 0.00 REG_REGION_NOT_LIVE_REGION 0.00 FLAG_WORK_PHONE 0.00 REG_REGION_NOT_WORK_REGION 0.00 LIVE_REGION_NOT_WORK_REGION 0.00 REG_CITY_NOT_LIVE_CITY 0.00 REG_CITY_NOT_WORK_CITY 0.00 LIVE_CITY_NOT_WORK_CITY 0.00 ORGANIZATION_TYPE 0.00 FLAG_CONT_MOBILE 0.00 FLAG_MOBIL 0.00 FLAG_EMP_PHONE 0.00 TARGET 0.00 NAME_CONTRACT_TYPE 0.00 CODE_GENDER 0.00 FLAG_OWN_CAR 0.00 FLAG_OWN_REALTY 0.00 CNT_CHILDREN 0.00 AMT_INCOME_TOTAL 0.00 AMT_CREDIT 0.00 AMT_ANNUITY 0.00 NAME_INCOME_TYPE 0.00 NAME_EDUCATION_TYPE 0.00 NAME_FAMILY_STATUS 0.00 NAME_HOUSING_TYPE 0.00 REGION_POPULATION_RELATIVE 0.00 DAYS_BIRTH 0.00 DAYS_EMPLOYED 0.00 DAYS_REGISTRATION 0.00 DAYS_ID_PUBLISH 0.00 DAYS_LAST_PHONE_CHANGE 0.00 dtype: float64
plt.figure(figsize=[30,10])
sns.barplot(x=application_data_up.OCCUPATION_TYPE.value_counts().index,
y=application_data_up.OCCUPATION_TYPE.value_counts().values).set_title("OCCUPATION TYPE COUNTS",fontsize=40, color='blue',pad=30)
plt.xlabel('OCCUPATION TYPE', fontsize=40, color='blue')
plt.xticks(rotation=45)
plt.show()
plt.figure(figsize=[20,15])
sns.set_style('darkgrid')
plt.subplot(2,2,1)
sns.boxplot(application_data_up['EXT_SOURCE_2']).set_title("EXT_SOURCE_2",fontsize=30,color='blue',pad=30)
plt.subplot(2,2,2)
sns.boxplot(application_data_up['EXT_SOURCE_3']).set_title("EXT_SOURCE_3",fontsize=30,color='blue',pad=30)
plt.subplot(2,2,3)
sns.distplot(application_data_up['EXT_SOURCE_2'],color='b')
plt.subplot(2,2,4)
sns.distplot(application_data_up['EXT_SOURCE_3'],color='b')
plt.show()
#REPLACING THE MISSING VALUES WITH ITS CORRESPONDING MEDIAN
for column in ['EXT_SOURCE_2','EXT_SOURCE_3']:
application_data_up[column].fillna(application_data_up[column].median(), inplace=True)
# CHECKING THE CORRELATION BETWEEN THE LOAN AMOUNT DEMANDED VS THE GOOD'S PRICE
sns.scatterplot(x=application_data_up['AMT_CREDIT'], y=application_data_up['AMT_GOODS_PRICE'], data=application_data_up)
plt.title("Correlation between the loan amount and the price of goods for which loan was given\n",
fontdict={'fontsize':35,'fontweight':10,'color':'Blue'})
plt.xlabel("Loan Amount",fontdict={'fontsize':35,'fontweight':10,'color':'Black'})
plt.ylabel("Price of Goods",fontdict={'fontsize':35,'fontweight':10,'color':'Black'})
plt.show()
# Imputing the above mentioned logic
application_data_up['AMT_GOODS_PRICE']=np.where(application_data_up['AMT_GOODS_PRICE'].isnull()==True,
application_data_up['AMT_CREDIT'],application_data_up['AMT_GOODS_PRICE'])
application_data_up['AMT_GOODS_PRICE'].isnull().sum()
0
import plotly.express as px
import plotly.graph_objs as go
import plotly.figure_factory as ff
#Percentage of each category
go.Figure(data=[go.Pie(labels=application_data_up.NAME_TYPE_SUITE.value_counts().index,
values=application_data_up.NAME_TYPE_SUITE.value_counts().values, hole=.5, title="NAME_TYPE_VALUE COUNTS",
pull=[0,0.1,0.1,0.1,0.1,0.1,0.1])])
application_data_up['NAME_TYPE_SUITE'].isnull().sum()/application_data_up.shape[0]*100
0.42014757195677555
application_data_up['NAME_TYPE_SUITE'].isnull().sum()
1292
application_data_up['NAME_TYPE_SUITE'].mode()
0 Unaccompanied Name: NAME_TYPE_SUITE, dtype: object
#Replacing missing values with MODE
application_data_up['NAME_TYPE_SUITE'].fillna(application_data_up['NAME_TYPE_SUITE'].mode()[0],inplace=True)
round(application_data_up.isnull().sum()/application_data_up.shape[0]*100,4).sort_values(ascending=False)
AMT_ANNUITY 0.0039 CNT_FAM_MEMBERS 0.0007 DAYS_LAST_PHONE_CHANGE 0.0003 HOUR_APPR_PROCESS_START 0.0000 FLAG_CONT_MOBILE 0.0000 FLAG_PHONE 0.0000 FLAG_EMAIL 0.0000 OCCUPATION_TYPE 0.0000 REGION_RATING_CLIENT 0.0000 REGION_RATING_CLIENT_W_CITY 0.0000 WEEKDAY_APPR_PROCESS_START 0.0000 REG_REGION_NOT_LIVE_REGION 0.0000 FLAG_EMP_PHONE 0.0000 REG_REGION_NOT_WORK_REGION 0.0000 LIVE_REGION_NOT_WORK_REGION 0.0000 REG_CITY_NOT_LIVE_CITY 0.0000 REG_CITY_NOT_WORK_CITY 0.0000 LIVE_CITY_NOT_WORK_CITY 0.0000 ORGANIZATION_TYPE 0.0000 EXT_SOURCE_2 0.0000 EXT_SOURCE_3 0.0000 FLAG_WORK_PHONE 0.0000 SK_ID_CURR 0.0000 TARGET 0.0000 DAYS_ID_PUBLISH 0.0000 NAME_CONTRACT_TYPE 0.0000 CODE_GENDER 0.0000 FLAG_OWN_CAR 0.0000 FLAG_OWN_REALTY 0.0000 CNT_CHILDREN 0.0000 AMT_INCOME_TOTAL 0.0000 AMT_CREDIT 0.0000 AMT_GOODS_PRICE 0.0000 NAME_TYPE_SUITE 0.0000 NAME_INCOME_TYPE 0.0000 NAME_EDUCATION_TYPE 0.0000 NAME_FAMILY_STATUS 0.0000 NAME_HOUSING_TYPE 0.0000 REGION_POPULATION_RELATIVE 0.0000 DAYS_BIRTH 0.0000 DAYS_EMPLOYED 0.0000 DAYS_REGISTRATION 0.0000 FLAG_MOBIL 0.0000 dtype: float64
#REMAINING COLUMNS WITH NEGLIGIBLE NULL VALUES (LESS THAN 1%)
null_col=['CNT_FAM_MEMBERS','AMT_ANNUITY', 'DAYS_LAST_PHONE_CHANGE']
for column in null_col:
application_data_up[column].fillna(application_data_up[column].median(), inplace=True)
round(application_data_up.isnull().sum()/application_data_up.shape[0]*100,4).sort_values(ascending=False)
SK_ID_CURR 0.0 HOUR_APPR_PROCESS_START 0.0 FLAG_CONT_MOBILE 0.0 FLAG_PHONE 0.0 FLAG_EMAIL 0.0 OCCUPATION_TYPE 0.0 CNT_FAM_MEMBERS 0.0 REGION_RATING_CLIENT 0.0 REGION_RATING_CLIENT_W_CITY 0.0 WEEKDAY_APPR_PROCESS_START 0.0 REG_REGION_NOT_LIVE_REGION 0.0 FLAG_EMP_PHONE 0.0 REG_REGION_NOT_WORK_REGION 0.0 LIVE_REGION_NOT_WORK_REGION 0.0 REG_CITY_NOT_LIVE_CITY 0.0 REG_CITY_NOT_WORK_CITY 0.0 LIVE_CITY_NOT_WORK_CITY 0.0 ORGANIZATION_TYPE 0.0 EXT_SOURCE_2 0.0 EXT_SOURCE_3 0.0 FLAG_WORK_PHONE 0.0 FLAG_MOBIL 0.0 TARGET 0.0 AMT_GOODS_PRICE 0.0 NAME_CONTRACT_TYPE 0.0 CODE_GENDER 0.0 FLAG_OWN_CAR 0.0 FLAG_OWN_REALTY 0.0 CNT_CHILDREN 0.0 AMT_INCOME_TOTAL 0.0 AMT_CREDIT 0.0 AMT_ANNUITY 0.0 NAME_TYPE_SUITE 0.0 DAYS_ID_PUBLISH 0.0 NAME_INCOME_TYPE 0.0 NAME_EDUCATION_TYPE 0.0 NAME_FAMILY_STATUS 0.0 NAME_HOUSING_TYPE 0.0 REGION_POPULATION_RELATIVE 0.0 DAYS_BIRTH 0.0 DAYS_EMPLOYED 0.0 DAYS_REGISTRATION 0.0 DAYS_LAST_PHONE_CHANGE 0.0 dtype: float64
application_data_up.info(verbose=True)
<class 'pandas.core.frame.DataFrame'> RangeIndex: 307511 entries, 0 to 307510 Data columns (total 43 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 SK_ID_CURR 307511 non-null int64 1 TARGET 307511 non-null int64 2 NAME_CONTRACT_TYPE 307511 non-null object 3 CODE_GENDER 307511 non-null object 4 FLAG_OWN_CAR 307511 non-null object 5 FLAG_OWN_REALTY 307511 non-null object 6 CNT_CHILDREN 307511 non-null int64 7 AMT_INCOME_TOTAL 307511 non-null float64 8 AMT_CREDIT 307511 non-null float64 9 AMT_ANNUITY 307511 non-null float64 10 AMT_GOODS_PRICE 307511 non-null float64 11 NAME_TYPE_SUITE 307511 non-null object 12 NAME_INCOME_TYPE 307511 non-null object 13 NAME_EDUCATION_TYPE 307511 non-null object 14 NAME_FAMILY_STATUS 307511 non-null object 15 NAME_HOUSING_TYPE 307511 non-null object 16 REGION_POPULATION_RELATIVE 307511 non-null float64 17 DAYS_BIRTH 307511 non-null int64 18 DAYS_EMPLOYED 307511 non-null int64 19 DAYS_REGISTRATION 307511 non-null float64 20 DAYS_ID_PUBLISH 307511 non-null int64 21 FLAG_MOBIL 307511 non-null int64 22 FLAG_EMP_PHONE 307511 non-null int64 23 FLAG_WORK_PHONE 307511 non-null int64 24 FLAG_CONT_MOBILE 307511 non-null int64 25 FLAG_PHONE 307511 non-null int64 26 FLAG_EMAIL 307511 non-null int64 27 OCCUPATION_TYPE 307511 non-null object 28 CNT_FAM_MEMBERS 307511 non-null float64 29 REGION_RATING_CLIENT 307511 non-null int64 30 REGION_RATING_CLIENT_W_CITY 307511 non-null int64 31 WEEKDAY_APPR_PROCESS_START 307511 non-null object 32 HOUR_APPR_PROCESS_START 307511 non-null int64 33 REG_REGION_NOT_LIVE_REGION 307511 non-null int64 34 REG_REGION_NOT_WORK_REGION 307511 non-null int64 35 LIVE_REGION_NOT_WORK_REGION 307511 non-null int64 36 REG_CITY_NOT_LIVE_CITY 307511 non-null int64 37 REG_CITY_NOT_WORK_CITY 307511 non-null int64 38 LIVE_CITY_NOT_WORK_CITY 307511 non-null int64 39 ORGANIZATION_TYPE 307511 non-null object 40 EXT_SOURCE_2 307511 non-null float64 41 EXT_SOURCE_3 307511 non-null float64 42 DAYS_LAST_PHONE_CHANGE 307511 non-null float64 dtypes: float64(10), int64(21), object(12) memory usage: 100.9+ MB
dayandcount=['CNT_FAM_MEMBERS','DAYS_REGISTRATION','DAYS_LAST_PHONE_CHANGE']
application_data_up.loc[:, dayandcount]=application_data_up.loc[:,dayandcount].apply(lambda x: x.astype('int64',errors='ignore'))
application_data_up.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 307511 entries, 0 to 307510 Data columns (total 43 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 SK_ID_CURR 307511 non-null int64 1 TARGET 307511 non-null int64 2 NAME_CONTRACT_TYPE 307511 non-null object 3 CODE_GENDER 307511 non-null object 4 FLAG_OWN_CAR 307511 non-null object 5 FLAG_OWN_REALTY 307511 non-null object 6 CNT_CHILDREN 307511 non-null int64 7 AMT_INCOME_TOTAL 307511 non-null float64 8 AMT_CREDIT 307511 non-null float64 9 AMT_ANNUITY 307511 non-null float64 10 AMT_GOODS_PRICE 307511 non-null float64 11 NAME_TYPE_SUITE 307511 non-null object 12 NAME_INCOME_TYPE 307511 non-null object 13 NAME_EDUCATION_TYPE 307511 non-null object 14 NAME_FAMILY_STATUS 307511 non-null object 15 NAME_HOUSING_TYPE 307511 non-null object 16 REGION_POPULATION_RELATIVE 307511 non-null float64 17 DAYS_BIRTH 307511 non-null int64 18 DAYS_EMPLOYED 307511 non-null int64 19 DAYS_REGISTRATION 307511 non-null int64 20 DAYS_ID_PUBLISH 307511 non-null int64 21 FLAG_MOBIL 307511 non-null int64 22 FLAG_EMP_PHONE 307511 non-null int64 23 FLAG_WORK_PHONE 307511 non-null int64 24 FLAG_CONT_MOBILE 307511 non-null int64 25 FLAG_PHONE 307511 non-null int64 26 FLAG_EMAIL 307511 non-null int64 27 OCCUPATION_TYPE 307511 non-null object 28 CNT_FAM_MEMBERS 307511 non-null int64 29 REGION_RATING_CLIENT 307511 non-null int64 30 REGION_RATING_CLIENT_W_CITY 307511 non-null int64 31 WEEKDAY_APPR_PROCESS_START 307511 non-null object 32 HOUR_APPR_PROCESS_START 307511 non-null int64 33 REG_REGION_NOT_LIVE_REGION 307511 non-null int64 34 REG_REGION_NOT_WORK_REGION 307511 non-null int64 35 LIVE_REGION_NOT_WORK_REGION 307511 non-null int64 36 REG_CITY_NOT_LIVE_CITY 307511 non-null int64 37 REG_CITY_NOT_WORK_CITY 307511 non-null int64 38 LIVE_CITY_NOT_WORK_CITY 307511 non-null int64 39 ORGANIZATION_TYPE 307511 non-null object 40 EXT_SOURCE_2 307511 non-null float64 41 EXT_SOURCE_3 307511 non-null float64 42 DAYS_LAST_PHONE_CHANGE 307511 non-null int64 dtypes: float64(7), int64(24), object(12) memory usage: 100.9+ MB
#LISTING OBJECT TYPE COLUMNS AND CONFIRMING THE VALUES TO BE IN STRING TYPE
obj_col=list(application_data_up.select_dtypes(include='object').columns)
application_data_up.loc[:,obj_col]=application_data_up.loc[:,obj_col].apply(lambda x: x.astype('str'))
application_data_up.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 307511 entries, 0 to 307510 Data columns (total 43 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 SK_ID_CURR 307511 non-null int64 1 TARGET 307511 non-null int64 2 NAME_CONTRACT_TYPE 307511 non-null object 3 CODE_GENDER 307511 non-null object 4 FLAG_OWN_CAR 307511 non-null object 5 FLAG_OWN_REALTY 307511 non-null object 6 CNT_CHILDREN 307511 non-null int64 7 AMT_INCOME_TOTAL 307511 non-null float64 8 AMT_CREDIT 307511 non-null float64 9 AMT_ANNUITY 307511 non-null float64 10 AMT_GOODS_PRICE 307511 non-null float64 11 NAME_TYPE_SUITE 307511 non-null object 12 NAME_INCOME_TYPE 307511 non-null object 13 NAME_EDUCATION_TYPE 307511 non-null object 14 NAME_FAMILY_STATUS 307511 non-null object 15 NAME_HOUSING_TYPE 307511 non-null object 16 REGION_POPULATION_RELATIVE 307511 non-null float64 17 DAYS_BIRTH 307511 non-null int64 18 DAYS_EMPLOYED 307511 non-null int64 19 DAYS_REGISTRATION 307511 non-null int64 20 DAYS_ID_PUBLISH 307511 non-null int64 21 FLAG_MOBIL 307511 non-null int64 22 FLAG_EMP_PHONE 307511 non-null int64 23 FLAG_WORK_PHONE 307511 non-null int64 24 FLAG_CONT_MOBILE 307511 non-null int64 25 FLAG_PHONE 307511 non-null int64 26 FLAG_EMAIL 307511 non-null int64 27 OCCUPATION_TYPE 307511 non-null object 28 CNT_FAM_MEMBERS 307511 non-null int64 29 REGION_RATING_CLIENT 307511 non-null int64 30 REGION_RATING_CLIENT_W_CITY 307511 non-null int64 31 WEEKDAY_APPR_PROCESS_START 307511 non-null object 32 HOUR_APPR_PROCESS_START 307511 non-null int64 33 REG_REGION_NOT_LIVE_REGION 307511 non-null int64 34 REG_REGION_NOT_WORK_REGION 307511 non-null int64 35 LIVE_REGION_NOT_WORK_REGION 307511 non-null int64 36 REG_CITY_NOT_LIVE_CITY 307511 non-null int64 37 REG_CITY_NOT_WORK_CITY 307511 non-null int64 38 LIVE_CITY_NOT_WORK_CITY 307511 non-null int64 39 ORGANIZATION_TYPE 307511 non-null object 40 EXT_SOURCE_2 307511 non-null float64 41 EXT_SOURCE_3 307511 non-null float64 42 DAYS_LAST_PHONE_CHANGE 307511 non-null int64 dtypes: float64(7), int64(24), object(12) memory usage: 100.9+ MB
CHECKING VALUES OF OTHER CATEGORICAL COLUMNS
CHECKING GENDER CODE COLUMN
# VALUE COUNTS OF GENDER CODE
application_data_up.CODE_GENDER.value_counts()
F 202448 M 105059 XNA 4 Name: CODE_GENDER, dtype: int64
#CHECKING GENDER COLUMN
plt.figure(figsize=[20,10])
sns.barplot(x=application_data_up.CODE_GENDER, y=application_data_up.TARGET).set_title("GENDER VS TARGET",fontsize=30,
color='Blue', pad=25)
plt.show()
round(application_data_up.ORGANIZATION_TYPE.value_counts()/application_data_up.shape[0]*100,2)
Business Entity Type 3 22.11 XNA 18.01 Self-employed 12.49 Other 5.43 Medicine 3.64 Business Entity Type 2 3.43 Government 3.38 School 2.89 Trade: type 7 2.55 Kindergarten 2.24 Construction 2.19 Business Entity Type 1 1.95 Transport: type 4 1.76 Trade: type 3 1.14 Industry: type 9 1.10 Industry: type 3 1.07 Security 1.06 Housing 0.96 Industry: type 11 0.88 Military 0.86 Bank 0.82 Agriculture 0.80 Police 0.76 Transport: type 2 0.72 Postal 0.70 Security Ministries 0.64 Trade: type 2 0.62 Restaurant 0.59 Services 0.51 University 0.43 Industry: type 7 0.43 Transport: type 3 0.39 Industry: type 1 0.34 Hotel 0.31 Electricity 0.31 Industry: type 4 0.29 Trade: type 6 0.21 Industry: type 5 0.19 Insurance 0.19 Telecom 0.19 Emergency 0.18 Industry: type 2 0.15 Advertising 0.14 Realtor 0.13 Culture 0.12 Industry: type 12 0.12 Trade: type 1 0.11 Mobile 0.10 Legal Services 0.10 Cleaning 0.08 Transport: type 1 0.07 Industry: type 6 0.04 Industry: type 10 0.04 Religion 0.03 Industry: type 13 0.02 Trade: type 4 0.02 Trade: type 5 0.02 Industry: type 8 0.01 Name: ORGANIZATION_TYPE, dtype: float64
#Therefore, we eliminate the sub-category with the overall category
application_data_up.ORGANIZATION_TYPE = application_data_up.ORGANIZATION_TYPE.apply(lambda x: 'Industry' if 'Industry' in x else x)
application_data_up.ORGANIZATION_TYPE = application_data_up.ORGANIZATION_TYPE.apply(lambda x: 'Trade' if 'Trade' in x else x)
application_data_up.ORGANIZATION_TYPE = application_data_up.ORGANIZATION_TYPE.apply(lambda x: 'Transport' if 'Transport' in x else x)
application_data_up.ORGANIZATION_TYPE = application_data_up.ORGANIZATION_TYPE.apply(lambda x: 'Business' if 'Business' in x else x)
round(application_data_up.ORGANIZATION_TYPE.value_counts()/application_data_up.shape[0]*100,2)
Business 27.49 XNA 18.01 Self-employed 12.49 Other 5.43 Trade 4.66 Industry 4.65 Medicine 3.64 Government 3.38 Transport 2.92 School 2.89 Kindergarten 2.24 Construction 2.19 Security 1.06 Housing 0.96 Military 0.86 Bank 0.82 Agriculture 0.80 Police 0.76 Postal 0.70 Security Ministries 0.64 Restaurant 0.59 Services 0.51 University 0.43 Hotel 0.31 Electricity 0.31 Insurance 0.19 Telecom 0.19 Emergency 0.18 Advertising 0.14 Realtor 0.13 Culture 0.12 Mobile 0.10 Legal Services 0.10 Cleaning 0.08 Religion 0.03 Name: ORGANIZATION_TYPE, dtype: float64
plt.figure(figsize=[30,10])
sns.barplot(x=application_data_up.ORGANIZATION_TYPE.value_counts().index,
y=application_data_up.ORGANIZATION_TYPE.value_counts().values).set_title("Distribution within ORGANIZATION_TYPE",
fontsize=20,color='Blue',pad=20)
plt.xlabel('ORGANIZATION_TYPE',fontsize=18,color='Black')
plt.xticks(rotation = 90)
plt.show()
application_data_up.NAME_CONTRACT_TYPE.value_counts()
Cash loans 278232 Revolving loans 29279 Name: NAME_CONTRACT_TYPE, dtype: int64
application_data_up.FLAG_OWN_CAR.value_counts()
N 202924 Y 104587 Name: FLAG_OWN_CAR, dtype: int64
application_data_up.FLAG_OWN_REALTY.value_counts()
Y 213312 N 94199 Name: FLAG_OWN_REALTY, dtype: int64
application_data_up.NAME_TYPE_SUITE.value_counts()
Unaccompanied 249818 Family 40149 Spouse, partner 11370 Children 3267 Other_B 1770 Other_A 866 Group of people 271 Name: NAME_TYPE_SUITE, dtype: int64
application_data_up.NAME_EDUCATION_TYPE.value_counts()
Secondary / secondary special 218391 Higher education 74863 Incomplete higher 10277 Lower secondary 3816 Academic degree 164 Name: NAME_EDUCATION_TYPE, dtype: int64
application_data_up.NAME_INCOME_TYPE.value_counts()
Working 158774 Commercial associate 71617 Pensioner 55362 State servant 21703 Unemployed 22 Student 18 Businessman 10 Maternity leave 5 Name: NAME_INCOME_TYPE, dtype: int64
application_data_up.NAME_FAMILY_STATUS.value_counts()
Married 196432 Single / not married 45444 Civil marriage 29775 Separated 19770 Widow 16088 Unknown 2 Name: NAME_FAMILY_STATUS, dtype: int64
application_data_up.NAME_HOUSING_TYPE.value_counts()
House / apartment 272868 With parents 14840 Municipal apartment 11183 Rented apartment 4881 Office apartment 2617 Co-op apartment 1122 Name: NAME_HOUSING_TYPE, dtype: int64
application_data_up.WEEKDAY_APPR_PROCESS_START.value_counts()
TUESDAY 53901 WEDNESDAY 51934 MONDAY 50714 THURSDAY 50591 FRIDAY 50338 SATURDAY 33852 SUNDAY 16181 Name: WEEKDAY_APPR_PROCESS_START, dtype: int64
application_data_up.DAYS_BIRTH.unique()
array([ -9461, -16765, -19046, ..., -7951, -7857, -25061], dtype=int64)
application_data_up.DAYS_EMPLOYED.unique()
array([ -637, -1188, -225, ..., -12971, -11084, -8694], dtype=int64)
#CHECKING 'DAYS_REGISTRATION' COLUMN
application_data_up.DAYS_REGISTRATION.unique()
array([ -3648, -1186, -4260, ..., -16396, -14558, -14798], dtype=int64)
#CHECKING 'DAYS_ID_PUBLISH' COLUMN
application_data_up.DAYS_ID_PUBLISH.unique()
array([-2120, -291, -2531, ..., -6194, -5854, -6211], dtype=int64)
#CHECKING 'DAYS_LAST_PHONE_CHANGE' COLUMN
application_data_up.DAYS_LAST_PHONE_CHANGE.unique()
array([-1134, -828, -815, ..., -3988, -3899, -3538], dtype=int64)
(application_data_up["DAYS_EMPLOYED"]//365).value_counts().tail()
-49 4 -47 4 0 2 -48 1 -50 1 Name: DAYS_EMPLOYED, dtype: int64
#changing values to positive integer
num_days=['DAYS_BIRTH','DAYS_EMPLOYED','DAYS_REGISTRATION','DAYS_ID_PUBLISH','DAYS_LAST_PHONE_CHANGE']
for i in num_days:
application_data_up[i]=abs(application_data_up[i])
application_data_up.DAYS_LAST_PHONE_CHANGE.unique()
array([1134, 828, 815, ..., 3988, 3899, 3538], dtype=int64)
application_data_up.AMT_INCOME_TOTAL.quantile([0,0.1,0.15,0.2,0.25,0.3,0.4,0.5,0.6,0.7,0.75,0.8,0.9,0.95,0.99,0.999,1])
0.000 25650.0 0.100 81000.0 0.150 90000.0 0.200 99000.0 0.250 112500.0 0.300 112500.0 0.400 135000.0 0.500 147150.0 0.600 162000.0 0.700 180000.0 0.750 202500.0 0.800 225000.0 0.900 270000.0 0.950 337500.0 0.990 472500.0 0.999 900000.0 1.000 117000000.0 Name: AMT_INCOME_TOTAL, dtype: float64
application_data_up['INCOME_SLAB']=pd.qcut(application_data_up['AMT_INCOME_TOTAL'], q=[0,0.2,0.5,0.75,0.95,1],
labels=['VeryLow','Low','Medium','High','VeryHigh'])
application_data_up['INCOME_SLAB'].value_counts()
Low 90089 Medium 82213 VeryLow 63671 High 57503 VeryHigh 14035 Name: INCOME_SLAB, dtype: int64
application_data_up['AMT_CREDIT_slab']=pd.qcut(application_data_up['AMT_CREDIT'], q=[0,0.2,0.5,0.75,0.95,1],
labels=['VeryLow','Low','Medium','High','VeryHigh'])
application_data_up['AMT_CREDIT_slab'].value_counts()
Low 88924 Medium 77786 VeryLow 64925 High 61842 VeryHigh 14034 Name: AMT_CREDIT_slab, dtype: int64
#BINNING OF DAYS_BIRTH / AGE COLUMN
#CONVERTING DAYS TO YEARS
application_data_up['AGE']=(application_data_up['DAYS_BIRTH']//365).astype('int64',errors='ignore')
#MAX & MIN VALUES
print(application_data_up['AGE'].max())
print(application_data_up['AGE'].min())
69 20
#CREATING 10 BINS
application_data_up['AGE_BINS']=pd.cut(application_data_up['AGE'],bins=np.arange(20,71,5))
application_data_up['AGE_BINS'].value_counts()
(35, 40] 43680 (40, 45] 39997 (30, 35] 39437 (25, 30] 36488 (50, 55] 35097 (45, 50] 34404 (55, 60] 32722 (60, 65] 24359 (20, 25] 16317 (65, 70] 5009 Name: AGE_BINS, dtype: int64
OUTLIER ANALYSIS
#CHECKING AMT_INCOME_TOTAL_COLUMN
application_data_up.AMT_INCOME_TOTAL.quantile([0.9991,0.9992,0.9993])
0.9991 9.025105e+05 0.9992 1.034820e+06 0.9993 1.125000e+06 Name: AMT_INCOME_TOTAL, dtype: float64
#Handling outliers for the AMT_INCOME_TOTAL by capping the income to 10Lakh: - Coping with threshold value
application_data_up['AMT_INCOME_TOTAL']= np.where(application_data_up['AMT_INCOME_TOTAL']
>1000000,1000000,application_data_up['AMT_INCOME_TOTAL'])
application_data_up['AMT_INCOME_TOTAL'].value_counts()
135000.0 35750
112500.0 31019
157500.0 26556
180000.0 24719
90000.0 22483
...
117324.0 1
64584.0 1
142897.5 1
109170.0 1
113062.5 1
Name: AMT_INCOME_TOTAL, Length: 2499, dtype: int64
application_data_up['AMT_INCOME_TOTAL'].max()
1000000.0
plt.figure(figsize=[20,5])
plt.subplot(1,2,1)
sns.boxplot(application_data_up['AMT_INCOME_TOTAL']).set_title("AMT_INCOME_TOTAL - BOX PLOT",fontsize=30,color='blue',pad=30)
plt.subplot(1,2,2)
sns.distplot(application_data_up['AMT_INCOME_TOTAL'],color='blue').set_title("AMT_INCOME_TOTAL - DISTRIBUTION",
fontsize=30,color='blue',pad=30)
plt.show()
application_data_up.shape
(307511, 47)
application_data_up.drop('AMT-INCOME_TOTAL',axis=1,inplace=True)
application_data_up.shape
(307511, 47)
Q3=np.quantile(application_data_up['AMT_INCOME_TOTAL'],0.75)
Q1=np.quantile(application_data_up['AMT_INCOME_TOTAL'],0.25)
IQR= Q3-Q1
lower_range=Q1-1.5*IQR
upper_range=Q3+1.5*IQR
#outliers=round(application_data_up.loc[(application_data_up['AMT_INCOME_TOTAL']> upper_range)].count()/new_credit.shape[0]*100)
#outliers
application_data_up.loc[(application_data_up['AMT_INCOME_TOTAL']>upper_range)].shape
(14035, 47)
def remov_outliers(data,col):
lower_range=0
upper_range=0
Q3=np.quantile(data[col],0.75)
Q1=np.quantile(data[col],0.25)
IQR=Q3-Q1
lower_range=Q1-1.5*IQR
upper_range=Q3-1.5*IQR
return upper_range,lower_range
#impute outliers with median
application_data_up['AMT_INCOME_TOTAL']=np.where(application_data_up['AMT_INCOME_TOTAL']
>upper_range,application_data_up['AMT_INCOME_TOTAL'].median(),application_data_up['AMT_INCOME_TOTAL'])
application_data_up.shape
(307511, 47)
sns.boxplot(application_data_up['AMT_INCOME_TOTAL']).set_title("AMT_INCOME_TOTAL - BOX PLOT", fontsize=25, color='blue', pad=20)
plt.show()
application_data_up.shape
(307511, 47)
plt.figure(figsize=[30,10])
plt.subplot(1,2,1)
sns.boxplot(application_data_up['AMT_INCOME_TOTAL']).set_title("AMT_INCOME_TOTAL - BOX PLOT", fontsize=25, color='blue',pad=30)
plt.subplot(1,2,2)
sns.distplot(application_data_up['AMT_INCOME_TOTAL'],color='blue').set_title("AMT_INCOME_TOTAL - DISTRIBUTION",
fontsize=25, color='blue', pad=25)
plt.show()
plt.figure(figsize=[20,5])
plt.subplot(1,2,1)
sns.boxplot(application_data_up['AMT_ANNUITY']).set_title("AMT_ANNUITY - BOX PLOT",fontsize=25,color='blue',pad=20)
plt.subplot(1,2,2)
sns.distplot(application_data_up['AMT_ANNUITY'],color='blue').set_title("AMT_ANNUITY - DISTRIBUTION",
fontsize=25,color='indigo',pad=20)
plt.show()
application_data_up.shape
(307511, 47)
plt.figure(figsize=[30,10])
plt.subplot(1,2,1)
sns.boxplot(application_data_up['AMT_ANNUITY']).set_title("AMT_ANNUITY - BOX PLOT",fontsize=15,color='blue',pad=20)
plt.subplot(1,2,2)
sns.distplot(application_data_up['AMT_ANNUITY'],color='blue').set_title("AMT_ANNUITY - DISTRIBUTION",
fontsize=15,color='blue',pad=20)
plt.show()
plt.figure(figsize=[30,10])
plt.subplot(1,2,1)
sns.boxplot(application_data_up['AGE']).set_title("AGE - BOXPLOT",fontsize=25,color='blue',pad=30)
plt.subplot(1,2,2)
sns.distplot(application_data_up['AGE'],color='blue').set_title("AGE - DISTRIBUTION",
fontsize=25,color='blue',pad=30)
plt.show()
plt.figure(figsize=[20,5])
plt.subplot(1,2,1)
sns.boxplot(application_data_up['DAYS_EMPLOYED']).set_title("DAYS_EMPLOYED - BOX PLOT",fontsize=25,color='blue',pad=30)
plt.subplot(1,2,2)
sns.distplot(application_data_up['DAYS_EMPLOYED'],color='blue').set_title("DAYS_EMPLOYED - DISTRIBUTION",
fontsize=25,color='blue',pad=30)
plt.show()
application_data_up['DAYS_EMPLOYED']=np.where(application_data_up['DAYS_EMPLOYED']
>=100000,application_data_up['DAYS_EMPLOYED'].median(),application_data_up['DAYS_EMPLOYED'])
application_data_up['DAYS_EMPLOYED']=np.where(application_data_up['DAYS_EMPLOYED']//365>=80,application_data_up['DAYS_EMPLOYED'].median(),application_data_up['DAYS_EMPLOYED'])
application_data_up.shape
(307511, 47)
plt.figure(figsize=[20,5])
plt.subplot(1,2,1)
sns.boxplot(application_data_up['DAYS_EMPLOYED']).set_title("DAYS_EMPLOYED - BOX PLOT",fontsize=25,color='blue',pad=30)
plt.subplot(1,2,2)
sns.distplot(application_data_up['DAYS_EMPLOYED'],color='blue').set_title("DAYS_EMPLOYED - DISTRIBUTION",
fontsize=25,color='blue',pad=30)
plt.show()
#CHECKING DAYS_REGISTRATION COLUMN
plt.figure(figsize=[20,5])
plt.subplot(1,2,1)
sns.boxplot(application_data_up['DAYS_REGISTRATION']).set_title("DAYS_REGISTRATION - BOX PLOT",fontsize=25,color='blue',pad=30)
plt.subplot(1,2,2)
sns.distplot(application_data_up['DAYS_REGISTRATION'],color='blue').set_title("DAYS_REGISTRATION - DISTRIBUTION",
fontsize=25,color='blue',pad=30)
plt.show()
#CREATING ADDITIONAL COLUMNS FOR ANALYSIS
#CREATING A COLUMN WITH VALUES CREDIT TO INCOME RATIO - Derived Metrics
application_data_up['CREDIT_RATIO']=(application_data_up.AMT_CREDIT // application_data_up.AMT_INCOME_TOTAL).astype('int64')
application_data_up.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 307511 entries, 0 to 307510 Data columns (total 48 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 SK_ID_CURR 307511 non-null int64 1 TARGET 307511 non-null int64 2 NAME_CONTRACT_TYPE 307511 non-null object 3 CODE_GENDER 307511 non-null object 4 FLAG_OWN_CAR 307511 non-null object 5 FLAG_OWN_REALTY 307511 non-null object 6 CNT_CHILDREN 307511 non-null int64 7 AMT_INCOME_TOTAL 307511 non-null float64 8 AMT_CREDIT 307511 non-null float64 9 AMT_ANNUITY 307511 non-null float64 10 AMT_GOODS_PRICE 307511 non-null float64 11 NAME_TYPE_SUITE 307511 non-null object 12 NAME_INCOME_TYPE 307511 non-null object 13 NAME_EDUCATION_TYPE 307511 non-null object 14 NAME_FAMILY_STATUS 307511 non-null object 15 NAME_HOUSING_TYPE 307511 non-null object 16 REGION_POPULATION_RELATIVE 307511 non-null float64 17 DAYS_BIRTH 307511 non-null int64 18 DAYS_EMPLOYED 307511 non-null float64 19 DAYS_REGISTRATION 307511 non-null int64 20 DAYS_ID_PUBLISH 307511 non-null int64 21 FLAG_MOBIL 307511 non-null int64 22 FLAG_EMP_PHONE 307511 non-null int64 23 FLAG_WORK_PHONE 307511 non-null int64 24 FLAG_CONT_MOBILE 307511 non-null int64 25 FLAG_PHONE 307511 non-null int64 26 FLAG_EMAIL 307511 non-null int64 27 OCCUPATION_TYPE 307511 non-null object 28 CNT_FAM_MEMBERS 307511 non-null int64 29 REGION_RATING_CLIENT 307511 non-null int64 30 REGION_RATING_CLIENT_W_CITY 307511 non-null int64 31 WEEKDAY_APPR_PROCESS_START 307511 non-null object 32 HOUR_APPR_PROCESS_START 307511 non-null int64 33 REG_REGION_NOT_LIVE_REGION 307511 non-null int64 34 REG_REGION_NOT_WORK_REGION 307511 non-null int64 35 LIVE_REGION_NOT_WORK_REGION 307511 non-null int64 36 REG_CITY_NOT_LIVE_CITY 307511 non-null int64 37 REG_CITY_NOT_WORK_CITY 307511 non-null int64 38 LIVE_CITY_NOT_WORK_CITY 307511 non-null int64 39 ORGANIZATION_TYPE 307511 non-null object 40 EXT_SOURCE_2 307511 non-null float64 41 EXT_SOURCE_3 307511 non-null float64 42 DAYS_LAST_PHONE_CHANGE 307511 non-null int64 43 INCOME_SLAB 307511 non-null category 44 AMT_CREDIT_slab 307511 non-null category 45 AGE 307511 non-null int64 46 AGE_BINS 307510 non-null category 47 CREDIT_RATIO 307511 non-null int64 dtypes: category(3), float64(8), int64(25), object(12) memory usage: 106.5+ MB
##CHECKING TARGET IMBALANCE
go.Figure(data=[go.Pie(labels=application_data_up.TARGET.value_counts(normalize=True).index,
values=application_data_up.TARGET.value_counts(normalize=True).values, hole=.5,
title='Defaulter Vs Non-Defaulters')])
##CHECKING GENDER IMBALANCE
go.Figure(data=[go.Pie(labels=application_data_up.CODE_GENDER.value_counts(normalize=True).index,
values=application_data_up.CODE_GENDER.value_counts(normalize=True).values, hole=.5,
title='Male Vs Female')])
plt.figure(figsize=[12,12])
f=sns.heatmap(application_data_up[['AMT_CREDIT','AMT_GOODS_PRICE','AMT_ANNUITY','AMT_INCOME_TOTAL','AGE','DAYS_BIRTH',
'DAYS_EMPLOYED','REGION_RATING_CLIENT','REGION_RATING_CLIENT_W_CITY']].corr(),
cmap="Blues",annot=True).set_title('Correlation between variables',fontsize=20,color='blue')
plt.show()
plt.figure(figsize=[12,12])
f=sns.heatmap(application_data_up[['AGE','FLAG_EMP_PHONE','DAYS_REGISTRATION','DAYS_ID_PUBLISH','REGION_RATING_CLIENT',
'REGION_RATING_CLIENT_W_CITY','REG_CITY_NOT_LIVE_CITY','REG_CITY_NOT_WORK_CITY',
'LIVE_CITY_NOT_WORK_CITY']].corr(),
cmap="Oranges",annot=True).set_title('Correlation between variables',fontsize=15,color='NAvy')
plt.show()
application_data_up['REG_CITY_NOT_LIVE_CITY'].value_counts()
0 283472 1 24039 Name: REG_CITY_NOT_LIVE_CITY, dtype: int64
application_data_up['AGE_BINS'].value_counts()
(35, 40] 43680 (40, 45] 39997 (30, 35] 39437 (25, 30] 36488 (50, 55] 35097 (45, 50] 34404 (55, 60] 32722 (60, 65] 24359 (20, 25] 16317 (65, 70] 5009 Name: AGE_BINS, dtype: int64
#Creating two datasets for target=1 and target=0 (1=bad,0=good)
target1=application_data_up[application_data_up['TARGET']==1]
target0=application_data_up[application_data_up['TARGET']==0]
print(target1.shape,target0.shape,application_data_up.shape)
(24825, 48) (282686, 48) (307511, 48)
print("The dataset with Target value 1 has :" + "{:.2%}".format(target1.shape[0]/application_data_up.shape[0])+ "data.")
print("The dataset with Target value 0 has :" + "{:.2%}".format(target0.shape[0]/application_data_up.shape[0])+ "data.")
The dataset with Target value 1 has :8.07%data. The dataset with Target value 0 has :91.93%data.
#CHECKING THE GENDER COLUMNS
plt.figure(figsize=[15,5])
plt.subplot(1,2,1)
sns.barplot(x=target1.CODE_GENDER.value_counts(normalize=True).index,
y=target1.CODE_GENDER.value_counts(normalize=True).values,
palette="rocket").set_title("Gender(Defulter)%" , fontsize=30, color="blue")
plt.subplot(1,2,2)
sns.barplot(x=target0.CODE_GENDER.value_counts(normalize=True).index,
y=target0.CODE_GENDER.value_counts(normalize=True).values,
palette="rocket").set_title("Gender(Non-Defulter)%" , fontsize=30, color="blue")
plt.show()
#checking NAME_CONTACT_TYPE
plt.figure(figsize=[15,5])
plt.subplot(1,2,1)
target1.NAME_CONTRACT_TYPE.value_counts(normalize=True).plot.barh(color="blue").set_title("NAME_CONTRACT_TYPE(DEFAULTER)%",
fontsize=20,color="black", pad=30)
plt.subplot(1,2,2)
target0.NAME_CONTRACT_TYPE.value_counts(normalize=True).plot.barh(color="red").set_title("NAME_CONTRACT_TYPE(NON-DEFAULTER)%",
fontsize=20,color="black", pad=30)
plt.show()
#Owing car vs Defaulting
plt.figure(figsize=[15,5])
plt.subplot(1,2,1)
target1.FLAG_OWN_CAR.value_counts(normalize=True).plot.barh(color="blue").set_title("FLAG_OWN_CAR(DEFAULTER)%",
fontsize=20,color="black", pad=30)
plt.subplot(1,2,2)
target0.FLAG_OWN_CAR.value_counts(normalize=True).plot.barh(color="red").set_title("FLAG_OWN_CAR(NON-DEFAULTER)%",
fontsize=20,color="black", pad=30)
plt.show()
#Owing a House/Estate etc vs Defaulting
plt.figure(figsize=[15,5])
plt.subplot(1,2,1)
target1.FLAG_OWN_REALTY.value_counts(normalize=True).plot.barh(color="blue").set_title("FLAG_OWN_REALTY(DEFAULTER)%",
fontsize=20,color="black", pad=30)
plt.subplot(1,2,2)
target0.FLAG_OWN_REALTY.value_counts(normalize=True).plot.barh(color="red").set_title("FLAG_OWN_REALTY(NON-DEFAULTER)%",
fontsize=20,color="black", pad=30)
plt.show()
#NAME_TYPE_SUITE vs Defaulting
plt.figure(figsize=[15,5])
plt.subplot(1,2,1)
target1.NAME_TYPE_SUITE.value_counts(normalize=True).plot.barh(color="blue").set_title("NAME_TYPE_SUITE(DEFAULTER)%",
fontsize=20,color="black", pad=30)
plt.subplot(1,2,2)
target0.NAME_TYPE_SUITE.value_counts(normalize=True).plot.barh(color="red").set_title("NAME_TYPE_SUITE(NON-DEFAULTER)%",
fontsize=20,color="black", pad=30)
plt.show()
#NAME_INCOME_TYPE vs Defaulting
plt.figure(figsize=[15,5])
plt.subplot(1,2,1)
target1.NAME_INCOME_TYPE.value_counts(normalize=True).plot.barh(color="blue").set_title("NAME_INCOME_TYPE(DEFAULTER)%",
fontsize=20,color="black", pad=30)
plt.subplot(1,2,2)
target0.NAME_INCOME_TYPE.value_counts(normalize=True).plot.barh(color="red").set_title("NAME_INCOME_TYPE(NON-DEFAULTER)%",
fontsize=20,color="black", pad=30)
plt.show()
#Education vs Defaulting
plt.figure(figsize=[15,5])
plt.subplot(1,2,1)
target1.NAME_EDUCATION_TYPE.value_counts(normalize=True).plot.barh(color="blue").set_title("NAME_EDUCATION_TYPE(DEFAULTER)%",
fontsize=20,color="black", pad=30)
plt.subplot(1,2,2)
target0.NAME_EDUCATION_TYPE.value_counts(normalize=True).plot.barh(color="red").set_title("NAME_EDUCATION_TYPE(NON-DEFAULTER)%",
fontsize=20,color="black", pad=30)
plt.show()
#Housing vs Defaulting
plt.figure(figsize=[15,5])
plt.subplot(1,2,1)
target1.NAME_HOUSING_TYPE.value_counts(normalize=True).plot.barh(color="blue").set_title("NAME_HOUSING_TYPE(DEFAULTER)%",
fontsize=20,color="black", pad=30)
plt.subplot(1,2,2)
target0.NAME_HOUSING_TYPE.value_counts(normalize=True).plot.barh(color="red").set_title("NAME_HOUSING_TYPE(NON-DEFAULTER)%",
fontsize=20,color="black", pad=30)
plt.show()
#Marital status vs Defaulting
plt.figure(figsize=[15,5])
plt.subplot(1,2,1)
target1.NAME_FAMILY_STATUS.value_counts(normalize=True).plot.barh(color="blue").set_title("NAME_FAMILY_STATUS(DEFAULTER)%",
fontsize=20,color="black", pad=30)
plt.subplot(1,2,2)
target0.NAME_FAMILY_STATUS.value_counts(normalize=True).plot.barh(color="red").set_title("NAME_FAMILY_STATUS(NON-DEFAULTER)%",
fontsize=20,color="black", pad=30)
plt.show()
#Occupations vs Defaulting
plt.figure(figsize=[15,5])
plt.subplot(1,2,1)
target1.OCCUPATION_TYPE.value_counts(normalize=True).plot.barh(color="blue").set_title("OCCUPATION_TYPE(DEFAULTER)%",
fontsize=20,color="black", pad=30)
plt.subplot(1,2,2)
target0.OCCUPATION_TYPE.value_counts(normalize=True).plot.barh(color="red").set_title("OCCUPATION_TYPE(NON-DEFAULTER)%",
fontsize=20,color="black", pad=30)
plt.show()
#Day of the week vs Defaulting
plt.figure(figsize=[15,5])
plt.subplot(1,2,1)
target1.WEEKDAY_APPR_PROCESS_START.value_counts(normalize=True).plot.barh(color="blue").set_title("WEEKDAY_APPR_PROCESS_START(DEFAULTER)%",
fontsize=10,color="black", pad=30)
plt.subplot(1,2,2)
target0.WEEKDAY_APPR_PROCESS_START.value_counts(normalize=True).plot.barh(color="red").set_title("WEEKDAY_APPR_PROCESS_START(NON-DEFAULTER)%",
fontsize=10,color="black", pad=30)
plt.show()
#Organization type vs Defaulting
plt.figure(figsize=[15,5])
plt.subplot(1,2,1)
target1.ORGANIZATION_TYPE.value_counts(normalize=True).plot.barh(color="blue").set_title("ORGANIZATION_TYPE(DEFAULTER)%",
fontsize=10,color="black", pad=30)
plt.subplot(1,2,2)
target0.ORGANIZATION_TYPE.value_counts(normalize=True).plot.barh(color="red").set_title("ORGANIZATION_TYPE(NON-DEFAULTER)%",
fontsize=10,color="black", pad=30)
plt.show()
#Income vs Defaulting
plt.figure(figsize=[15,5])
plt.subplot(1,2,1)
target1.INCOME_SLAB.value_counts(normalize=True).plot.barh(color="blue").set_title("INCOME_SLAB(DEFAULTER)%",
fontsize=10,color="black", pad=30)
plt.subplot(1,2,2)
target0.INCOME_SLAB.value_counts(normalize=True).plot.barh(color="red").set_title("INCOME_SLAB(NON-DEFAULTER)%",
fontsize=10,color="black", pad=30)
plt.show()
#Loan amount vs Defaulting
plt.figure(figsize=[15,5])
plt.subplot(1,2,1)
target1.AMT_CREDIT_slab.value_counts(normalize=True).plot.barh(color="blue").set_title("AMT_CREDIT_slab(DEFAULTER)%",
fontsize=10,color="black", pad=30)
plt.subplot(1,2,2)
target0.AMT_CREDIT_slab.value_counts(normalize=True).plot.barh(color="red").set_title("AMT_CREDIT_slab(NON-DEFAULTER)%",
fontsize=10,color="black", pad=30)
plt.show()
#Age groups vs Defaulting
plt.figure(figsize=[15,5])
plt.subplot(1,2,1)
target1.AGE_BINS.value_counts(normalize=True).plot.barh(color="blue").set_title("AGE_BINS(DEFAULTER)%",
fontsize=10,color="black", pad=30)
plt.subplot(1,2,2)
target0.AGE_BINS.value_counts(normalize=True).plot.barh(color="red").set_title("AGE_BINS(NON-DEFAULTER)%",
fontsize=10,color="black", pad=30)
plt.show()
target0.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 282686 entries, 1 to 307510 Data columns (total 48 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 SK_ID_CURR 282686 non-null int64 1 TARGET 282686 non-null int64 2 NAME_CONTRACT_TYPE 282686 non-null object 3 CODE_GENDER 282686 non-null object 4 FLAG_OWN_CAR 282686 non-null object 5 FLAG_OWN_REALTY 282686 non-null object 6 CNT_CHILDREN 282686 non-null int64 7 AMT_INCOME_TOTAL 282686 non-null float64 8 AMT_CREDIT 282686 non-null float64 9 AMT_ANNUITY 282686 non-null float64 10 AMT_GOODS_PRICE 282686 non-null float64 11 NAME_TYPE_SUITE 282686 non-null object 12 NAME_INCOME_TYPE 282686 non-null object 13 NAME_EDUCATION_TYPE 282686 non-null object 14 NAME_FAMILY_STATUS 282686 non-null object 15 NAME_HOUSING_TYPE 282686 non-null object 16 REGION_POPULATION_RELATIVE 282686 non-null float64 17 DAYS_BIRTH 282686 non-null int64 18 DAYS_EMPLOYED 282686 non-null float64 19 DAYS_REGISTRATION 282686 non-null int64 20 DAYS_ID_PUBLISH 282686 non-null int64 21 FLAG_MOBIL 282686 non-null int64 22 FLAG_EMP_PHONE 282686 non-null int64 23 FLAG_WORK_PHONE 282686 non-null int64 24 FLAG_CONT_MOBILE 282686 non-null int64 25 FLAG_PHONE 282686 non-null int64 26 FLAG_EMAIL 282686 non-null int64 27 OCCUPATION_TYPE 282686 non-null object 28 CNT_FAM_MEMBERS 282686 non-null int64 29 REGION_RATING_CLIENT 282686 non-null int64 30 REGION_RATING_CLIENT_W_CITY 282686 non-null int64 31 WEEKDAY_APPR_PROCESS_START 282686 non-null object 32 HOUR_APPR_PROCESS_START 282686 non-null int64 33 REG_REGION_NOT_LIVE_REGION 282686 non-null int64 34 REG_REGION_NOT_WORK_REGION 282686 non-null int64 35 LIVE_REGION_NOT_WORK_REGION 282686 non-null int64 36 REG_CITY_NOT_LIVE_CITY 282686 non-null int64 37 REG_CITY_NOT_WORK_CITY 282686 non-null int64 38 LIVE_CITY_NOT_WORK_CITY 282686 non-null int64 39 ORGANIZATION_TYPE 282686 non-null object 40 EXT_SOURCE_2 282686 non-null float64 41 EXT_SOURCE_3 282686 non-null float64 42 DAYS_LAST_PHONE_CHANGE 282686 non-null int64 43 INCOME_SLAB 282686 non-null category 44 AMT_CREDIT_slab 282686 non-null category 45 AGE 282686 non-null int64 46 AGE_BINS 282685 non-null category 47 CREDIT_RATIO 282686 non-null int64 dtypes: category(3), float64(8), int64(25), object(12) memory usage: 100.0+ MB
target0.shape
(282686, 48)
WORLING WITH PREVIOUS APPLICATION
previous_application.head()
| SK_ID_PREV | SK_ID_CURR | NAME_CONTRACT_TYPE | AMT_ANNUITY | AMT_APPLICATION | AMT_CREDIT | AMT_DOWN_PAYMENT | AMT_GOODS_PRICE | WEEKDAY_APPR_PROCESS_START | HOUR_APPR_PROCESS_START | FLAG_LAST_APPL_PER_CONTRACT | NFLAG_LAST_APPL_IN_DAY | RATE_DOWN_PAYMENT | RATE_INTEREST_PRIMARY | RATE_INTEREST_PRIVILEGED | NAME_CASH_LOAN_PURPOSE | NAME_CONTRACT_STATUS | DAYS_DECISION | NAME_PAYMENT_TYPE | CODE_REJECT_REASON | NAME_TYPE_SUITE | NAME_CLIENT_TYPE | NAME_GOODS_CATEGORY | NAME_PORTFOLIO | NAME_PRODUCT_TYPE | CHANNEL_TYPE | SELLERPLACE_AREA | NAME_SELLER_INDUSTRY | CNT_PAYMENT | NAME_YIELD_GROUP | PRODUCT_COMBINATION | DAYS_FIRST_DRAWING | DAYS_FIRST_DUE | DAYS_LAST_DUE_1ST_VERSION | DAYS_LAST_DUE | DAYS_TERMINATION | NFLAG_INSURED_ON_APPROVAL | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2030495 | 271877 | Consumer loans | 1730.430 | 17145.0 | 17145.0 | 0.0 | 17145.0 | SATURDAY | 15 | Y | 1 | 0.0 | 0.182832 | 0.867336 | XAP | Approved | -73 | Cash through the bank | XAP | NaN | Repeater | Mobile | POS | XNA | Country-wide | 35 | Connectivity | 12.0 | middle | POS mobile with interest | 365243.0 | -42.0 | 300.0 | -42.0 | -37.0 | 0.0 |
| 1 | 2802425 | 108129 | Cash loans | 25188.615 | 607500.0 | 679671.0 | NaN | 607500.0 | THURSDAY | 11 | Y | 1 | NaN | NaN | NaN | XNA | Approved | -164 | XNA | XAP | Unaccompanied | Repeater | XNA | Cash | x-sell | Contact center | -1 | XNA | 36.0 | low_action | Cash X-Sell: low | 365243.0 | -134.0 | 916.0 | 365243.0 | 365243.0 | 1.0 |
| 2 | 2523466 | 122040 | Cash loans | 15060.735 | 112500.0 | 136444.5 | NaN | 112500.0 | TUESDAY | 11 | Y | 1 | NaN | NaN | NaN | XNA | Approved | -301 | Cash through the bank | XAP | Spouse, partner | Repeater | XNA | Cash | x-sell | Credit and cash offices | -1 | XNA | 12.0 | high | Cash X-Sell: high | 365243.0 | -271.0 | 59.0 | 365243.0 | 365243.0 | 1.0 |
| 3 | 2819243 | 176158 | Cash loans | 47041.335 | 450000.0 | 470790.0 | NaN | 450000.0 | MONDAY | 7 | Y | 1 | NaN | NaN | NaN | XNA | Approved | -512 | Cash through the bank | XAP | NaN | Repeater | XNA | Cash | x-sell | Credit and cash offices | -1 | XNA | 12.0 | middle | Cash X-Sell: middle | 365243.0 | -482.0 | -152.0 | -182.0 | -177.0 | 1.0 |
| 4 | 1784265 | 202054 | Cash loans | 31924.395 | 337500.0 | 404055.0 | NaN | 337500.0 | THURSDAY | 9 | Y | 1 | NaN | NaN | NaN | Repairs | Refused | -781 | Cash through the bank | HC | NaN | Repeater | XNA | Cash | walk-in | Credit and cash offices | -1 | XNA | 24.0 | high | Cash Street: high | NaN | NaN | NaN | NaN | NaN | NaN |
previous_application.shape
(1670214, 37)
previous_application.isnull().sum()
SK_ID_PREV 0 SK_ID_CURR 0 NAME_CONTRACT_TYPE 0 AMT_ANNUITY 372235 AMT_APPLICATION 0 AMT_CREDIT 1 AMT_DOWN_PAYMENT 895844 AMT_GOODS_PRICE 385515 WEEKDAY_APPR_PROCESS_START 0 HOUR_APPR_PROCESS_START 0 FLAG_LAST_APPL_PER_CONTRACT 0 NFLAG_LAST_APPL_IN_DAY 0 RATE_DOWN_PAYMENT 895844 RATE_INTEREST_PRIMARY 1664263 RATE_INTEREST_PRIVILEGED 1664263 NAME_CASH_LOAN_PURPOSE 0 NAME_CONTRACT_STATUS 0 DAYS_DECISION 0 NAME_PAYMENT_TYPE 0 CODE_REJECT_REASON 0 NAME_TYPE_SUITE 820405 NAME_CLIENT_TYPE 0 NAME_GOODS_CATEGORY 0 NAME_PORTFOLIO 0 NAME_PRODUCT_TYPE 0 CHANNEL_TYPE 0 SELLERPLACE_AREA 0 NAME_SELLER_INDUSTRY 0 CNT_PAYMENT 372230 NAME_YIELD_GROUP 0 PRODUCT_COMBINATION 346 DAYS_FIRST_DRAWING 673065 DAYS_FIRST_DUE 673065 DAYS_LAST_DUE_1ST_VERSION 673065 DAYS_LAST_DUE 673065 DAYS_TERMINATION 673065 NFLAG_INSURED_ON_APPROVAL 673065 dtype: int64
req_columns=['SK_ID_CURR','AMT_APPLICATION','NAME_CASH_LOAN_PURPOSE','NAME_CONTRACT_STATUS','DAYS_DECISION','NAME_PAYMENT_TYPE',
'CODE_REJECT_REASON','NAME_CLIENT_TYPE','NAME_GOODS_CATEGORY','NAME_PORTFOLIO','NAME_PRODUCT_TYPE','CHANNEL_TYPE',
'NAME_YIELD_GROUP']
previous_application=previous_application.loc[:,req_columns]
previous_application.shape
(1670214, 13)
#checking for any missing values
previous_application.isnull().sum()
SK_ID_CURR 0 AMT_APPLICATION 0 NAME_CASH_LOAN_PURPOSE 0 NAME_CONTRACT_STATUS 0 DAYS_DECISION 0 NAME_PAYMENT_TYPE 0 CODE_REJECT_REASON 0 NAME_CLIENT_TYPE 0 NAME_GOODS_CATEGORY 0 NAME_PORTFOLIO 0 NAME_PRODUCT_TYPE 0 CHANNEL_TYPE 0 NAME_YIELD_GROUP 0 dtype: int64
previous_application.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1670214 entries, 0 to 1670213 Data columns (total 13 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 SK_ID_CURR 1670214 non-null int64 1 AMT_APPLICATION 1670214 non-null float64 2 NAME_CASH_LOAN_PURPOSE 1670214 non-null object 3 NAME_CONTRACT_STATUS 1670214 non-null object 4 DAYS_DECISION 1670214 non-null int64 5 NAME_PAYMENT_TYPE 1670214 non-null object 6 CODE_REJECT_REASON 1670214 non-null object 7 NAME_CLIENT_TYPE 1670214 non-null object 8 NAME_GOODS_CATEGORY 1670214 non-null object 9 NAME_PORTFOLIO 1670214 non-null object 10 NAME_PRODUCT_TYPE 1670214 non-null object 11 CHANNEL_TYPE 1670214 non-null object 12 NAME_YIELD_GROUP 1670214 non-null object dtypes: float64(1), int64(2), object(10) memory usage: 165.7+ MB
previous_application.DAYS_DECISION.unique()
array([ -73, -164, -301, ..., -1967, -2389, -1], dtype=int64)
#changing values to positive integer
previous_application['DAYS_DECISION']=abs(previous_application['DAYS_DECISION'])
previous_application.NAME_PAYMENT_TYPE.value_counts()
Cash through the bank 1033552 XNA 627384 Non-cash from your account 8193 Cashless from the account of the employer 1085 Name: NAME_PAYMENT_TYPE, dtype: int64
previous_application.NAME_CLIENT_TYPE.value_counts()
Repeater 1231261 New 301363 Refreshed 135649 XNA 1941 Name: NAME_CLIENT_TYPE, dtype: int64
previous_application.NAME_PORTFOLIO.value_counts()
POS 691011 Cash 461563 XNA 372230 Cards 144985 Cars 425 Name: NAME_PORTFOLIO, dtype: int64
#CHANGING XNA TO UNKNOWN
XNA_col=['NAME_PAYMENT_TYPE','NAME_CLIENT_TYPE','NAME_PORTFOLIO']
for i in XNA_col:
previous_application[i]=previous_application[i].str.replace('XMA','unknown')
Univariate analysis ont the previous_application columns
#plotting graph for amt_application
plt.figure(figsize=[20,6])
plt.subplot(1,2,1)
sns.boxplot(previous_application['AMT_APPLICATION']).set_title("AMT_APPLICATION - BOXPLOT", fontsize=20, color='blue', pad=20)
plt.subplot(1,2,2)
sns.distplot(previous_application['AMT_APPLICATION'],color="blue").set_title("AMT_APPLICATION - DISTRIBUTION",
fontsize=20, color='red', pad=20)
plt.show()
#plotting graph for DAYS_DECISION
plt.figure(figsize=[20,6])
plt.subplot(1,2,1)
sns.boxplot(previous_application['DAYS_DECISION']).set_title("DAYS_DECISION - BOX PLOT",fontsize=20,color='blue',pad=20)
plt.subplot(1,2,2)
sns.distplot(previous_application['DAYS_DECISION'],color='red').set_title("DAYS_DECISION - DISTRIBUTION",
fontsize=20,color='blue',pad=20)
plt.show()
#reason for load
plt.figure(figsize=[20,10])
previous_application.NAME_CASH_LOAN_PURPOSE.value_counts(normalize=True).plot.barh(color='blue').set_title("NAME_CASH_LOAN_PURPOSE",
fontsize=20,color='blue',pad=20)
plt.show()
#since most of the values are missing,this column is not useful for our analysis
previous_application.drop(columns=['NAME_CASH_LOAN_PURPOSE'],inplace=True)
previous_application.shape
(1670214, 12)
#Payment method for loan
plt.figure(figsize=[20,10])
previous_application.NAME_PAYMENT_TYPE.value_counts(normalize=True).plot.barh(color="blue").set_title("Payment method for loan",
fontsize=15,color="black", pad=30)
plt.show()
#kind of goods the client applied for in the previous loan
plt.figure(figsize=[20,10])
previous_application.NAME_GOODS_CATEGORY.value_counts(normalize=True).plot.barh(color="blue").set_title("kind of goods the client applied for in the previous loan",
fontsize=15,color="black", pad=30)
plt.show()
previous_application.drop(columns=['NAME_GOODS_CATEGORY'],inplace=True) #this
previous_application.shape
(1670214, 11)
#was the prev app for cash,pos,car
plt.figure(figsize=[20,10])
previous_application.NAME_PORTFOLIO.value_counts(normalize=True).plot.barh(color="Brown").set_title("REASON FOR PREV APP",
fontsize=15,color="black", pad=20)
plt.show()
#Through which channel we acquired the client on the previous application
plt.figure(figsize=[20,10])
previous_application.CHANNEL_TYPE.value_counts(normalize=True).plot.barh(color="blue").set_title("Through which channel we acquired the client on the previous application",
fontsize=15,color="black", pad=30)
plt.show()
#since this column will not add value to our analysis, we are dropping it....
previous_application.drop(columns=['NAME_YIELD_GROUP'],inplace=True)
previous_application.shape
(1670214, 10)
Merging both new and old dataframes
merged_df=pd.merge(left=application_data_up,right=previous_application,how='inner',on='SK_ID_CURR',suffixes='_x')
merged_df.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 1413701 entries, 0 to 1413700 Data columns (total 57 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 SK_ID_CURR 1413701 non-null int64 1 TARGET 1413701 non-null int64 2 NAME_CONTRACT_TYPE 1413701 non-null object 3 CODE_GENDER 1413701 non-null object 4 FLAG_OWN_CAR 1413701 non-null object 5 FLAG_OWN_REALTY 1413701 non-null object 6 CNT_CHILDREN 1413701 non-null int64 7 AMT_INCOME_TOTAL 1413701 non-null float64 8 AMT_CREDIT 1413701 non-null float64 9 AMT_ANNUITY 1413701 non-null float64 10 AMT_GOODS_PRICE 1413701 non-null float64 11 NAME_TYPE_SUITE 1413701 non-null object 12 NAME_INCOME_TYPE 1413701 non-null object 13 NAME_EDUCATION_TYPE 1413701 non-null object 14 NAME_FAMILY_STATUS 1413701 non-null object 15 NAME_HOUSING_TYPE 1413701 non-null object 16 REGION_POPULATION_RELATIVE 1413701 non-null float64 17 DAYS_BIRTH 1413701 non-null int64 18 DAYS_EMPLOYED 1413701 non-null float64 19 DAYS_REGISTRATION 1413701 non-null int64 20 DAYS_ID_PUBLISH 1413701 non-null int64 21 FLAG_MOBIL 1413701 non-null int64 22 FLAG_EMP_PHONE 1413701 non-null int64 23 FLAG_WORK_PHONE 1413701 non-null int64 24 FLAG_CONT_MOBILE 1413701 non-null int64 25 FLAG_PHONE 1413701 non-null int64 26 FLAG_EMAIL 1413701 non-null int64 27 OCCUPATION_TYPE 1413701 non-null object 28 CNT_FAM_MEMBERS 1413701 non-null int64 29 REGION_RATING_CLIENT 1413701 non-null int64 30 REGION_RATING_CLIENT_W_CITY 1413701 non-null int64 31 WEEKDAY_APPR_PROCESS_START 1413701 non-null object 32 HOUR_APPR_PROCESS_START 1413701 non-null int64 33 REG_REGION_NOT_LIVE_REGION 1413701 non-null int64 34 REG_REGION_NOT_WORK_REGION 1413701 non-null int64 35 LIVE_REGION_NOT_WORK_REGION 1413701 non-null int64 36 REG_CITY_NOT_LIVE_CITY 1413701 non-null int64 37 REG_CITY_NOT_WORK_CITY 1413701 non-null int64 38 LIVE_CITY_NOT_WORK_CITY 1413701 non-null int64 39 ORGANIZATION_TYPE 1413701 non-null object 40 EXT_SOURCE_2 1413701 non-null float64 41 EXT_SOURCE_3 1413701 non-null float64 42 DAYS_LAST_PHONE_CHANGE 1413701 non-null int64 43 INCOME_SLAB 1413701 non-null category 44 AMT_CREDIT_slab 1413701 non-null category 45 AGE 1413701 non-null int64 46 AGE_BINS 1413697 non-null category 47 CREDIT_RATIO 1413701 non-null int64 48 AMT_APPLICATION 1413701 non-null float64 49 NAME_CONTRACT_STATUS 1413701 non-null object 50 DAYS_DECISION 1413701 non-null int64 51 NAME_PAYMENT_TYPE 1413701 non-null object 52 CODE_REJECT_REASON 1413701 non-null object 53 NAME_CLIENT_TYPE 1413701 non-null object 54 NAME_PORTFOLIO 1413701 non-null object 55 NAME_PRODUCT_TYPE 1413701 non-null object 56 CHANNEL_TYPE 1413701 non-null object dtypes: category(3), float64(9), int64(26), object(19) memory usage: 597.3+ MB
BIVARIATE/MULTIVARIATE ANALYSIS %of Loan Payment Difficulties
# %of Loan Payment Difficulties for NAME_CONTRACT_STATUS and NAME_CLIENT_TYPE
table=pd.pivot_table(merged_df, values='TARGET', index=['NAME_CLIENT_TYPE'],
columns=['NAME_CONTRACT_STATUS'], aggfunc=np.mean)
cm=sns.light_palette("blue", as_cmap=True)
table.style.background_gradient(cmap=cm)
| NAME_CONTRACT_STATUS | Approved | Canceled | Refused | Unused offer |
|---|---|---|---|---|
| NAME_CLIENT_TYPE | ||||
| New | 0.088216 | 0.145205 | 0.110940 | 0.089448 |
| Refreshed | 0.065158 | 0.081098 | 0.117412 | 0.074324 |
| Repeater | 0.072144 | 0.091767 | 0.120596 | 0.083338 |
| XNA | 0.071264 | 0.101377 | 0.135714 | 0.103448 |
table.T.plot(kind='bar').set_ylabel('% of Loan-Payment Difficulties')
plt.title('% of Loan Payment Difficulties for NAME_CONTRACT_STATUS and NAME_CLIENT_TYPE',fontdict={'fontsize':18},pad=20)
plt.show()
# %of Loan Payment Difficulties for NAME_CONTRACT_STATUS and NAME_CLIENT_TYPE
table=pd.pivot_table(merged_df, values='TARGET', index=['NAME_CONTRACT_TYPE'],
columns=['NAME_CONTRACT_STATUS'], aggfunc=np.mean)
cm=sns.light_palette("blue", as_cmap=True)
table.style.background_gradient(cmap=cm)
| NAME_CONTRACT_STATUS | Approved | Canceled | Refused | Unused offer |
|---|---|---|---|---|
| NAME_CONTRACT_TYPE | ||||
| Cash loans | 0.078105 | 0.094178 | 0.123735 | 0.084637 |
| Revolving loans | 0.049836 | 0.058751 | 0.069429 | 0.061972 |
table.T.plot(kind='bar').set_ylabel('% of Loan-Payment Difficulties')
plt.title('% of Loan Payment Difficulties for NAME_CONTRACT_STATUS and NAME_CLIENT_TYPE',fontdict={'fontsize':18},pad=20)
plt.show()
#NAME_HOUSING_TYPE Vs NAME_CONTRACT_STATUS
plt.figure(figsize=[20,10])
sns.countplot(data=merged_df, y='NAME_HOUSING_TYPE', order=merged_df['NAME_HOUSING_TYPE'].value_counts().index,
hue='NAME_CONTRACT_STATUS').set_title('NAME_HOUSING_TYPE Vs NAME_CONTRACT_STATUS',
fontsize=30, color='blue',pad= 20)
plt.ylabel("NAME_HOUSING_TYPE",fontdict={'fontsize':25,'fontweight':5,'color':'Blue'})
plt.show()
#CODE_GENDER Vs NAME_CONTRACT_STATUS
plt.figure(figsize=[20,10])
sns.countplot(data=merged_df, y='CODE_GENDER', order=merged_df['CODE_GENDER'].value_counts().index,
hue='NAME_CONTRACT_STATUS').set_title('CODE_GENDER Vs NAME_CONTRACT_STATUS',
fontsize=30, color='blue',pad= 20)
plt.ylabel("CODE_GENDER",fontdict={'fontsize':25,'fontweight':5,'color':'Blue'})
plt.show()
#NAME_EDUCATION_TYPE Vs NAME_CONTRACT_STATUS
plt.figure(figsize=[20,10])
sns.countplot(data=merged_df, y='NAME_EDUCATION_TYPE', order=merged_df['NAME_EDUCATION_TYPE'].value_counts().index,
hue='NAME_CONTRACT_STATUS',palette='crest').set_title('NAME_EDUCATION_TYPE Vs NAME_CONTRACT_STATUS',
fontsize=30, color='blue',pad= 20)
plt.ylabel("NAME_EDUCATION_TYPE",fontdict={'fontsize':25,'fontweight':5,'color':'blue'})
plt.show()
#AMT_CREDIT_SLAB Vs NAME_CONTRACT_STATUS
plt.figure(figsize=[20,10])
sns.countplot(data=merged_df, y='AMT_CREDIT_slab', order=merged_df['AMT_CREDIT_slab'].value_counts().index,
hue='NAME_CONTRACT_STATUS',palette='icefire').set_title('AMT_CREDIT_slab Vs NAME_CONTRACT_STATUS',
fontsize=30, color='Blue',pad= 20)
plt.ylabel("AMT_CREDIT_slab",fontdict={'fontsize':25,'fontweight':5,'color':'Blue'})
plt.show()
#AGE_BINS Vs NAME_CONTRACT_STATUS
plt.figure(figsize=[20,10])
sns.countplot(data=merged_df, y='AGE_BINS', order=merged_df['AGE_BINS'].value_counts().index,
hue='NAME_CONTRACT_STATUS',palette='cubehelix').set_title('AGE_BINS Vs NAME_CONTRACT_STATUS',
fontsize=30, color='blue',pad= 20)
plt.ylabel("NAME_EDUCATION_TYPE",fontdict={'fontsize':25,'fontweight':5,'color':'blue'})
plt.show()
#INCOME_SLAB Vs NAME_CONTRACT_STATUS
plt.figure(figsize=[20,10])
sns.countplot(data=merged_df, y='INCOME_SLAB', order=merged_df['INCOME_SLAB'].value_counts().index,
hue='NAME_CONTRACT_STATUS',palette='Paired').set_title('INCOME_SLAB Vs NAME_CONTRACT_STATUS',
fontsize=30, color='Blue',pad= 20)
plt.ylabel("INCOME_SLAB",fontdict={'fontsize':25,'fontweight':5,'color':'Blue'})
plt.show()
#Contract type Vs AMT_CREDIT
plt.figure(figsize=[20,10])
sns.boxenplot(x=merged_df['NAME_CONTRACT_STATUS'], y=merged_df['AMT_CREDIT'], hue=merged_df['TARGET'],
palette='viridis').set_title("Contract Vs AMT_CREDIT", fontsize=30,color='blue',pad=20)
plt.xlabel("NAME_CONTRACT_STATUS",fontdict={'fontsize':25,'fontweight':5,'color':'Blue'})
plt.ylabel("AMT_CREDIT",fontdict={'fontsize':25,'fontweight':5,'color':'Blue'})
plt.show()
#INCOME_SLAB Vs AMT_CREDIT
plt.figure(figsize=[20,10])
sns.boxenplot(x=merged_df['INCOME_SLAB'], y=merged_df['AMT_CREDIT'], hue=merged_df['TARGET'],
palette='viridis').set_title("INCOME_SLAB Vs AMT_CREDIT", fontsize=30,color='blue',pad=20)
plt.xlabel("INCOME_SLAB",fontdict={'fontsize':25,'fontweight':5,'color':'Blue'})
plt.ylabel("AMT_CREDIT",fontdict={'fontsize':25,'fontweight':5,'color':'Blue'})
plt.show()
#NAME_INCOME_TYPE Vs AMT_CREDIT
plt.figure(figsize=[20,10])
sns.boxenplot(x=merged_df['NAME_INCOME_TYPE'], y=merged_df['AMT_CREDIT'], hue=merged_df['TARGET'],
palette='flare').set_title("NAME_INCOME_TYPE Vs AMT_CREDIT", fontsize=30,color='blue',pad=20)
plt.xlabel("NAME_INCOME_TYPE",fontdict={'fontsize':25,'fontweight':5,'color':'Blue'})
plt.ylabel("AMT_CREDIT",fontdict={'fontsize':25,'fontweight':5,'color':'Blue'})
plt.show()
#NAME_CLIENT_TYPE Vs AMT_CREDIT
plt.figure(figsize=[20,10])
sns.boxenplot(x=merged_df['NAME_CLIENT_TYPE'], y=merged_df['AMT_CREDIT'], hue=merged_df['TARGET'],
palette='light:#5A9').set_title("NAME_CLIENT_TYPE Vs AMT_CREDIT", fontsize=30,color='blue',pad=20)
plt.xlabel("NAME_CLIENT_TYPE",fontdict={'fontsize':25,'fontweight':5,'color':'Blue'})
plt.ylabel("AMT_CREDIT",fontdict={'fontsize':25,'fontweight':5,'color':'Blue'})
plt.show()